diff --git a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp index 231e565f27e52..86443a155069e 100644 --- a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp @@ -86,15 +86,17 @@ MagicNumbersCheck::MagicNumbersCheck(StringRef Name, ClangTidyContext *Context) IgnoredDoublePointValues.reserve(IgnoredFloatingPointValuesInput.size()); for (const auto &InputValue : IgnoredFloatingPointValuesInput) { llvm::APFloat FloatValue(llvm::APFloat::IEEEsingle()); - if (!FloatValue.convertFromString(InputValue, DefaultRoundingMode)) { - assert(false && "Invalid floating point representation"); - } + auto StatusOrErr = + FloatValue.convertFromString(InputValue, DefaultRoundingMode); + assert(StatusOrErr && "Invalid floating point representation"); + consumeError(StatusOrErr.takeError()); IgnoredFloatingPointValues.push_back(FloatValue.convertToFloat()); llvm::APFloat DoubleValue(llvm::APFloat::IEEEdouble()); - if (!DoubleValue.convertFromString(InputValue, DefaultRoundingMode)) { - assert(false && "Invalid floating point representation"); - } + StatusOrErr = + DoubleValue.convertFromString(InputValue, DefaultRoundingMode); + assert(StatusOrErr && "Invalid floating point representation"); + consumeError(StatusOrErr.takeError()); IgnoredDoublePointValues.push_back(DoubleValue.convertToDouble()); } llvm::sort(IgnoredFloatingPointValues.begin(), diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp index b1eca02813b38..f11b1236803c7 100644 --- a/clang-tools-extra/clangd/CompileCommands.cpp +++ b/clang-tools-extra/clangd/CompileCommands.cpp @@ -155,7 +155,9 @@ void CommandMangler::adjust(std::vector<std::string> &Cmd) const { if (ResourceDir && !Has("-resource-dir")) Cmd.push_back(("-resource-dir=" + *ResourceDir)); - if (Sysroot && !Has("-isysroot")) { + // Don't set `-isysroot` if it is already set or if `--sysroot` is set. + // `--sysroot` is a superset of the `-isysroot` argument. + if (Sysroot && !Has("-isysroot") && !Has("--sysroot")) { Cmd.push_back("-isysroot"); Cmd.push_back(*Sysroot); } diff --git a/clang-tools-extra/clangd/Diagnostics.cpp b/clang-tools-extra/clangd/Diagnostics.cpp index e78df0322eb32..ad8f6c8bef9a1 100644 --- a/clang-tools-extra/clangd/Diagnostics.cpp +++ b/clang-tools-extra/clangd/Diagnostics.cpp @@ -22,6 +22,8 @@ #include "clang/Lex/Token.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -328,14 +330,22 @@ CodeAction toCodeAction(const Fix &F, const URIForFile &File) { void toLSPDiags( const Diag &D, const URIForFile &File, const ClangdDiagnosticOptions &Opts, llvm::function_ref<void(clangd::Diagnostic, llvm::ArrayRef<Fix>)> OutFn) { - auto FillBasicFields = [](const DiagBase &D) -> clangd::Diagnostic { - clangd::Diagnostic Res; - Res.range = D.Range; - Res.severity = getSeverity(D.Severity); - return Res; - }; + clangd::Diagnostic Main; + Main.severity = getSeverity(D.Severity); + + // Main diagnostic should always refer to a range inside the main file. If a + // diagnostic made it so far, it means that either it or one of its notes is + // inside the main file.
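A minimal illustration of the invariant described above, mirroring the unit test added to DiagnosticsTests.cpp further down (helper and field names come from that test):

  clangd::Diag D;                        // error located in an included header
  D.InsideMainFile = false;
  D.Notes.emplace_back();
  D.Notes.back().InsideMainFile = true;  // e.g. the "included here" note
  toLSPDiags(D, {}, Opts, [&](clangd::Diagnostic LSPDiag, llvm::ArrayRef<Fix>) {
    assert(LSPDiag.range == D.Notes.back().Range); // the note's range is used
  });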
+ if (D.InsideMainFile) { + Main.range = D.Range; + } else { + auto It = + llvm::find_if(D.Notes, [](const Note &N) { return N.InsideMainFile; }); + assert(It != D.Notes.end() && + "neither the main diagnostic nor notes are inside main file"); + Main.range = It->Range; + } - clangd::Diagnostic Main = FillBasicFields(D); Main.code = D.Name; switch (D.Source) { case Diag::Clang: @@ -379,7 +389,9 @@ void toLSPDiags( for (auto &Note : D.Notes) { if (!Note.InsideMainFile) continue; - clangd::Diagnostic Res = FillBasicFields(Note); + clangd::Diagnostic Res; + Res.severity = getSeverity(Note.Severity); + Res.range = Note.Range; Res.message = noteMessage(D, Note, Opts); OutFn(std::move(Res), llvm::ArrayRef<Fix>()); } diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index b1a2e289eed79..20883b347fdc4 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -25,9 +25,11 @@ #include "clang/AST/PrettyPrinter.h" #include "clang/Index/IndexSymbol.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/Support/Casting.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/raw_ostream.h" +#include <string> namespace clang { namespace clangd { @@ -224,8 +226,8 @@ void enhanceFromIndex(HoverInfo &Hover, const NamedDecl &ND, // Populates Type, ReturnType, and Parameters for function-like decls. void fillFunctionTypeAndParams(HoverInfo &HI, const Decl *D, - const FunctionDecl *FD, - const PrintingPolicy &Policy) { + const FunctionDecl *FD, + const PrintingPolicy &Policy) { HI.Parameters.emplace(); for (const ParmVarDecl *PVD : FD->parameters()) { HI.Parameters->emplace_back(); @@ -250,11 +252,11 @@ void fillFunctionTypeAndParams(HoverInfo &HI, const Decl *D, } } - if (const auto* CCD = llvm::dyn_cast<CXXConstructorDecl>(FD)) { + if (const auto *CCD = llvm::dyn_cast<CXXConstructorDecl>(FD)) { // Constructor's "return type" is the class type. HI.ReturnType = declaredType(CCD->getParent()).getAsString(Policy); // Don't provide any type for the constructor itself. - } else if (llvm::isa<CXXDestructorDecl>(FD)){ + } else if (llvm::isa<CXXDestructorDecl>(FD)) { HI.ReturnType = "void"; } else { HI.ReturnType = FD->getReturnType().getAsString(Policy); @@ -309,7 +311,7 @@ llvm::Optional<std::string> printExprValue(const SelectionTree::Node *N, } /// Generate a \p Hover object given the declaration \p D.
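Since getHoverContents is narrowed below from Decl to NamedDecl, a caller-side sketch looks like this (hypothetical call site, not part of the patch):

  if (const auto *ND = llvm::dyn_cast<NamedDecl>(D))
    HoverInfo HI = getHoverContents(ND, Index); // unnamed decls never hover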
-HoverInfo getHoverContents(const Decl *D, const SymbolIndex *Index) { +HoverInfo getHoverContents(const NamedDecl *D, const SymbolIndex *Index) { HoverInfo HI; const ASTContext &Ctx = D->getASTContext(); @@ -321,12 +323,10 @@ HoverInfo getHoverContents(const Decl *D, const SymbolIndex *Index) { HI.LocalScope.append("::"); PrintingPolicy Policy = printingPolicyForDecls(Ctx.getPrintingPolicy()); - if (const NamedDecl *ND = llvm::dyn_cast<NamedDecl>(D)) { - HI.Name = printName(Ctx, *ND); - ND = getDeclForComment(ND); - HI.Documentation = getDeclComment(Ctx, *ND); - enhanceFromIndex(HI, *ND, Index); - } + HI.Name = printName(Ctx, *D); + const auto *CommentD = getDeclForComment(D); + HI.Documentation = getDeclComment(Ctx, *CommentD); + enhanceFromIndex(HI, *CommentD, Index); HI.Kind = index::getSymbolInfo(D).Kind; @@ -460,34 +460,70 @@ llvm::Optional<HoverInfo> getHover(ParsedAST &AST, Position Pos, tooling::applyAllReplacements(HI->Definition, Replacements)) HI->Definition = *Formatted; - HI->SymRange = getTokenRange(AST.getSourceManager(), - AST.getLangOpts(), SourceLocationBeg); + HI->SymRange = getTokenRange(AST.getSourceManager(), AST.getLangOpts(), + SourceLocationBeg); return HI; } markup::Document HoverInfo::present() const { markup::Document Output; - if (NamespaceScope) { - auto &P = Output.addParagraph(); - P.appendText("Declared in"); - // Drop trailing "::". - if (!LocalScope.empty()) - P.appendCode(llvm::StringRef(LocalScope).drop_back(2)); - else if (NamespaceScope->empty()) - P.appendCode("global namespace"); - else - P.appendCode(llvm::StringRef(*NamespaceScope).drop_back(2)); + // Header contains text of the form: + // variable `var` : `int` + // + // class `X` + // + // function `foo` → `int` + markup::Paragraph &Header = Output.addParagraph(); + Header.appendText(index::getSymbolKindString(Kind)); + assert(!Name.empty() && "hover triggered on a nameless symbol"); + Header.appendCode(Name); + if (ReturnType) { + Header.appendText("→"); + Header.appendCode(*ReturnType); + } else if (Type) { + Header.appendText(":"); + Header.appendCode(*Type); } - if (!Definition.empty()) { - Output.addCodeBlock(Definition); - } else { - // Builtin types - Output.addCodeBlock(Name); + // For functions we display the signature in a list form, e.g.: + // - `bool param1` + // - `int param2 = 5` + if (Parameters && !Parameters->empty()) { + markup::BulletList &L = Output.addBulletList(); + for (const auto &Param : *Parameters) { + std::string Buffer; + llvm::raw_string_ostream OS(Buffer); + OS << Param; + L.addItem().addParagraph().appendCode(std::move(OS.str())); + } + } + + if (Value) { + markup::Paragraph &P = Output.addParagraph(); + P.appendText("Value ="); + P.appendCode(*Value); } if (!Documentation.empty()) Output.addParagraph().appendText(Documentation); + + if (!Definition.empty()) { + std::string ScopeComment; + // Drop trailing "::". + if (!LocalScope.empty()) { + // Container name, e.g. class, method, function. + // We might want to propagate some info about the container type to print + // function foo, class X, method X::bar, etc. + ScopeComment = + "// In " + llvm::StringRef(LocalScope).rtrim(':').str() + '\n'; + } else if (NamespaceScope && !NamespaceScope->empty()) { + ScopeComment = "// In namespace " + + llvm::StringRef(*NamespaceScope).rtrim(':').str() + '\n'; + } + // Note that we don't print anything for the global namespace, to avoid + // annoying non-C++ projects or projects that are not making use of + // namespaces.
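For example, the emitted code block for a function in namespace ns reads as follows (this matches the expected render in the hover tests below):

  // In namespace ns
  ret_type foo(params) {}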
+ Output.addCodeBlock(ScopeComment + Definition); + } return Output; } diff --git a/clang-tools-extra/clangd/test/hover.test b/clang-tools-extra/clangd/test/hover.test index e45164b346ea5..2162ff9abcdc3 100644 --- a/clang-tools-extra/clangd/test/hover.test +++ b/clang-tools-extra/clangd/test/hover.test @@ -9,7 +9,7 @@ # CHECK-NEXT: "result": { # CHECK-NEXT: "contents": { # CHECK-NEXT: "kind": "plaintext", -# CHECK-NEXT: "value": "Declared in global namespace\n\nvoid foo()" +# CHECK-NEXT: "value": "function foo → void\n\nvoid foo()" # CHECK-NEXT: }, # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -37,7 +37,7 @@ # CHECK-NEXT: "result": { # CHECK-NEXT: "contents": { # CHECK-NEXT: "kind": "plaintext", -# CHECK-NEXT: "value": "Declared in global namespace\n\nenum foo {}" +# CHECK-NEXT: "value": "enum foo\n\nenum foo {}" # CHECK-NEXT: }, # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { diff --git a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp index 0941af25213ca..ef73519ef1385 100644 --- a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp +++ b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp @@ -1014,6 +1014,29 @@ TEST(IgnoreDiags, FromNonWrittenInclude) { EXPECT_THAT(TU.build().getDiagnostics(), UnorderedElementsAre()); } +TEST(ToLSPDiag, RangeIsInMain) { + ClangdDiagnosticOptions Opts; + clangd::Diag D; + D.Range = {pos(1, 2), pos(3, 4)}; + D.Notes.emplace_back(); + Note &N = D.Notes.back(); + N.Range = {pos(2, 3), pos(3, 4)}; + + D.InsideMainFile = true; + N.InsideMainFile = false; + toLSPDiags(D, {}, Opts, + [&](clangd::Diagnostic LSPDiag, ArrayRef) { + EXPECT_EQ(LSPDiag.range, D.Range); + }); + + D.InsideMainFile = false; + N.InsideMainFile = true; + toLSPDiags(D, {}, Opts, + [&](clangd::Diagnostic LSPDiag, ArrayRef) { + EXPECT_EQ(LSPDiag.range, N.Range); + }); +} + } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 4c6d2abbd24a1..44337688ff87a 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -1606,6 +1606,95 @@ TEST(Hover, DocsFromMostSpecial) { } } } +TEST(Hover, Present) { + struct { + const std::function Builder; + llvm::StringRef ExpectedRender; + } Cases[] = { + { + [](HoverInfo &HI) { + HI.Kind = index::SymbolKind::Unknown; + HI.Name = "X"; + }, + R"( X)", + }, + { + [](HoverInfo &HI) { + HI.Kind = index::SymbolKind::NamespaceAlias; + HI.Name = "foo"; + }, + R"(namespace-alias foo)", + }, + { + [](HoverInfo &HI) { + HI.Kind = index::SymbolKind::Class; + HI.TemplateParameters = { + {std::string("typename"), std::string("T"), llvm::None}, + {std::string("typename"), std::string("C"), + std::string("bool")}, + }; + HI.Documentation = "documentation"; + HI.Definition = + "template class Foo {}"; + HI.Name = "foo"; + HI.NamespaceScope.emplace(); + }, + R"(class foo +documentation + +template class Foo {})", + }, + { + [](HoverInfo &HI) { + HI.Kind = index::SymbolKind::Function; + HI.Name = "foo"; + HI.Type = "type"; + HI.ReturnType = "ret_type"; + HI.Parameters.emplace(); + HoverInfo::Param P; + HI.Parameters->push_back(P); + P.Type = "type"; + HI.Parameters->push_back(P); + P.Name = "foo"; + HI.Parameters->push_back(P); + P.Default = "default"; + HI.Parameters->push_back(P); + HI.NamespaceScope = "ns::"; + HI.Definition = "ret_type foo(params) {}"; + }, + R"(function foo → ret_type +- +- type +- type foo +- 
type foo = default + +// In namespace ns +ret_type foo(params) {})", + }, + { + [](HoverInfo &HI) { + HI.Kind = index::SymbolKind::Variable; + HI.LocalScope = "test::bar::"; + HI.Value = "value"; + HI.Name = "foo"; + HI.Type = "type"; + HI.Definition = "def"; + }, + R"(variable foo : type +Value = value + +// In test::bar +def)", + }, + }; + + for (const auto &C : Cases) { + HoverInfo HI; + C.Builder(HI); + EXPECT_EQ(HI.present().asPlainText(), C.ExpectedRender); + } +} + } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-misleading-indentation.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-misleading-indentation.cpp index c3bd33d8ee7b8..aea0618d120db 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability-misleading-indentation.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-misleading-indentation.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s readability-misleading-indentation %t +// RUN: %check_clang_tidy %s readability-misleading-indentation %t -- -- -fno-delayed-template-parsing void foo1(); void foo2(); @@ -168,6 +168,17 @@ void mustFailNonTemplate() { // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: different indentation for 'if' and corresponding 'else' [readability-misleading-indentation] } +template +void mustFailNoInsta() { + if constexpr (b) { + foo1(); + } + else { + foo2(); + // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: different indentation for 'if' and corresponding 'else' [readability-misleading-indentation] + } +} + template void mustPassNoInsta() { if constexpr (b) { diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index f55ffccc84d05..856d5e34bbcc2 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2661,7 +2661,8 @@ This will produce a generic test.bc file that can be used in vendor toolchains to perform machine code generation. Clang currently supports OpenCL C language standards up to v2.0. Starting from -clang 9 a C++ mode is available for OpenCL (see :ref:`C++ for OpenCL `). +clang 9 a C++ mode is available for OpenCL (see +:ref:`C++ for OpenCL `). OpenCL Specific Options ----------------------- @@ -3024,7 +3025,7 @@ There are some standard OpenCL functions that are implemented as Clang builtins: enqueue query functions from `section 6.13.17.5 `_. -.. _opencl_cpp: +.. 
_cxx_for_opencl: C++ for OpenCL -------------- diff --git a/clang/include/clang/AST/ASTLambda.h b/clang/include/clang/AST/ASTLambda.h index c1153168e41bb..6fd82d6af4908 100644 --- a/clang/include/clang/AST/ASTLambda.h +++ b/clang/include/clang/AST/ASTLambda.h @@ -64,6 +64,17 @@ inline bool isGenericLambdaCallOperatorSpecialization(DeclContext *DC) { dyn_cast(DC)); } +inline bool isGenericLambdaCallOperatorOrStaticInvokerSpecialization( + DeclContext *DC) { + CXXMethodDecl *MD = dyn_cast(DC); + if (!MD) return false; + const CXXRecordDecl *LambdaClass = MD->getParent(); + if (LambdaClass && LambdaClass->isGenericLambda()) + return (isLambdaCallOperator(MD) || MD->isLambdaStaticInvoker()) && + MD->isFunctionTemplateSpecialization(); + return false; +} + // This returns the parent DeclContext ensuring that the correct // parent DeclContext is returned for Lambdas diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h index 9dab814b659ba..a672d92695da9 100644 --- a/clang/include/clang/AST/ASTNodeTraverser.h +++ b/clang/include/clang/AST/ASTNodeTraverser.h @@ -384,6 +384,9 @@ class ASTNodeTraverser for (const auto *Parameter : D->parameters()) Visit(Parameter); + if (const Expr *TRC = D->getTrailingRequiresClause()) + Visit(TRC); + if (const auto *C = dyn_cast(D)) for (const auto *I : C->inits()) Visit(I); diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index cd97c6dcf8d5c..002d1434b1cbd 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -669,10 +669,12 @@ struct QualifierInfo { /// Represents a ValueDecl that came out of a declarator. /// Contains type source information through TypeSourceInfo. class DeclaratorDecl : public ValueDecl { - // A struct representing both a TInfo and a syntactic qualifier, - // to be used for the (uncommon) case of out-of-line declarations. + // A struct representing a TInfo, a trailing requires-clause and a syntactic + // qualifier, to be used for the (uncommon) case of out-of-line declarations + // and constrained function decls. struct ExtInfo : public QualifierInfo { TypeSourceInfo *TInfo; + Expr *TrailingRequiresClause = nullptr; }; llvm::PointerUnion DeclInfo; @@ -739,6 +741,21 @@ class DeclaratorDecl : public ValueDecl { void setQualifierInfo(NestedNameSpecifierLoc QualifierLoc); + /// \brief Get the constraint-expression introduced by the trailing + /// requires-clause in the function/member declaration, or null if no + /// requires-clause was provided. + Expr *getTrailingRequiresClause() { + return hasExtInfo() ? getExtInfo()->TrailingRequiresClause + : nullptr; + } + + const Expr *getTrailingRequiresClause() const { + return hasExtInfo() ? getExtInfo()->TrailingRequiresClause + : nullptr; + } + + void setTrailingRequiresClause(Expr *TrailingRequiresClause); + unsigned getNumTemplateParameterLists() const { return hasExtInfo() ? 
getExtInfo()->NumTemplParamLists : 0; } @@ -1903,7 +1920,8 @@ class FunctionDecl : public DeclaratorDecl, FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass S, bool isInlineSpecified, - ConstexprSpecKind ConstexprKind); + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause = nullptr); using redeclarable_base = Redeclarable; @@ -1938,11 +1956,12 @@ class FunctionDecl : public DeclaratorDecl, SourceLocation NLoc, DeclarationName N, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInlineSpecified = false, bool hasWrittenPrototype = true, - ConstexprSpecKind ConstexprKind = CSK_unspecified) { + ConstexprSpecKind ConstexprKind = CSK_unspecified, + Expr *TrailingRequiresClause = nullptr) { DeclarationNameInfo NameInfo(N, NLoc); return FunctionDecl::Create(C, DC, StartLoc, NameInfo, T, TInfo, SC, isInlineSpecified, hasWrittenPrototype, - ConstexprKind); + ConstexprKind, TrailingRequiresClause); } static FunctionDecl *Create(ASTContext &C, DeclContext *DC, @@ -1950,7 +1969,8 @@ class FunctionDecl : public DeclaratorDecl, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInlineSpecified, bool hasWrittenPrototype, - ConstexprSpecKind ConstexprKind); + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause); static FunctionDecl *CreateDeserialized(ASTContext &C, unsigned ID); @@ -2352,6 +2372,17 @@ class FunctionDecl : public DeclaratorDecl, /// the target functionality. bool isTargetMultiVersion() const; + /// \brief Get the associated-constraints of this function declaration. + /// Currently, this will either be a vector of size 1 containing the + /// trailing-requires-clause or an empty vector. + /// + /// Use this instead of getTrailingRequiresClause for concepts APIs that + /// accept an ArrayRef of constraint expressions. 
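A hedged sketch of the intended call pattern (the call site is illustrative; the declaration follows):

  llvm::SmallVector<const Expr *, 1> AC;
  FD->getAssociatedConstraints(AC);
  // AC is either empty or holds exactly the trailing requires-clause, and can
  // be handed to constraint APIs such as Sema::CheckConstraintSatisfaction.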
+ void getAssociatedConstraints(SmallVectorImpl &AC) const { + if (auto *TRC = getTrailingRequiresClause()) + AC.push_back(TRC); + } + void setPreviousDeclaration(FunctionDecl * PrevDecl); FunctionDecl *getCanonicalDecl() override; diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 1c6f99438fc3e..aba33e383976c 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -1905,9 +1905,10 @@ class CXXMethodDecl : public FunctionDecl { SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInline, ConstexprSpecKind ConstexprKind, - SourceLocation EndLocation) + SourceLocation EndLocation, + Expr *TrailingRequiresClause = nullptr) : FunctionDecl(DK, C, RD, StartLoc, NameInfo, T, TInfo, SC, isInline, - ConstexprKind) { + ConstexprKind, TrailingRequiresClause) { if (EndLocation.isValid()) setRangeEnd(EndLocation); } @@ -1918,7 +1919,8 @@ class CXXMethodDecl : public FunctionDecl { const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInline, ConstexprSpecKind ConstexprKind, - SourceLocation EndLocation); + SourceLocation EndLocation, + Expr *TrailingRequiresClause = nullptr); static CXXMethodDecl *CreateDeserialized(ASTContext &C, unsigned ID); @@ -2363,7 +2365,8 @@ class CXXConstructorDecl final const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, ExplicitSpecifier ES, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, - InheritedConstructor Inherited); + InheritedConstructor Inherited, + Expr *TrailingRequiresClause); void anchor() override; @@ -2416,7 +2419,8 @@ class CXXConstructorDecl final const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, ExplicitSpecifier ES, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, - InheritedConstructor Inherited = InheritedConstructor()); + InheritedConstructor Inherited = InheritedConstructor(), + Expr *TrailingRequiresClause = nullptr); ExplicitSpecifier getExplicitSpecifier() { return getCanonicalDecl()->getExplicitSpecifierInternal(); @@ -2623,9 +2627,11 @@ class CXXDestructorDecl : public CXXMethodDecl { CXXDestructorDecl(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool isInline, - bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind) + bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause = nullptr) : CXXMethodDecl(CXXDestructor, C, RD, StartLoc, NameInfo, T, TInfo, - SC_None, isInline, ConstexprKind, SourceLocation()) { + SC_None, isInline, ConstexprKind, SourceLocation(), + TrailingRequiresClause) { setImplicit(isImplicitlyDeclared); } @@ -2637,7 +2643,8 @@ class CXXDestructorDecl : public CXXMethodDecl { const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool isInline, bool isImplicitlyDeclared, - ConstexprSpecKind ConstexprKind); + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause = nullptr); static CXXDestructorDecl *CreateDeserialized(ASTContext & C, unsigned ID); void setOperatorDelete(FunctionDecl *OD, Expr *ThisArg); @@ -2676,9 +2683,11 @@ class CXXConversionDecl : public CXXMethodDecl { CXXConversionDecl(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool isInline, ExplicitSpecifier ES, - ConstexprSpecKind ConstexprKind, SourceLocation 
EndLocation) + ConstexprSpecKind ConstexprKind, SourceLocation EndLocation, + Expr *TrailingRequiresClause = nullptr) : CXXMethodDecl(CXXConversion, C, RD, StartLoc, NameInfo, T, TInfo, - SC_None, isInline, ConstexprKind, EndLocation), + SC_None, isInline, ConstexprKind, EndLocation, + TrailingRequiresClause), ExplicitSpec(ES) {} void anchor() override; @@ -2694,7 +2703,7 @@ class CXXConversionDecl : public CXXMethodDecl { Create(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool isInline, ExplicitSpecifier ES, ConstexprSpecKind ConstexprKind, - SourceLocation EndLocation); + SourceLocation EndLocation, Expr *TrailingRequiresClause = nullptr); static CXXConversionDecl *CreateDeserialized(ASTContext &C, unsigned ID); ExplicitSpecifier getExplicitSpecifier() { diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 19dd62b0fe0fd..144ef221d6920 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2030,6 +2030,11 @@ bool RecursiveASTVisitor::TraverseFunctionHelper(FunctionDecl *D) { } } + // Visit the trailing requires clause, if any. + if (Expr *TrailingRequiresClause = D->getTrailingRequiresClause()) { + TRY_TO(TraverseStmt(TrailingRequiresClause)); + } + if (CXXConstructorDecl *Ctor = dyn_cast(D)) { // Constructor initializers. for (auto *I : Ctor->inits()) { diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index af7fb92d230d6..cae44fb274a4a 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -178,8 +178,8 @@ class FunctionArgument : Argument; class NamedArgument : Argument; + opt, + fake>; class TypeArgument : Argument; class UnsignedArgument : Argument; class VariadicUnsignedArgument : Argument; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 997a3c3d84dcc..71f2ea8cd0df5 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -4689,7 +4689,7 @@ below. The explicit attribute annotation indicates that the third parameter (`start_routine`) is called zero or more times by the `pthread_create` function, and that the fourth parameter (`arg`) is passed along. Note that the callback behavior of `pthread_create` is automatically recognized by Clang. In addition, -the declarations of `__kmpc_fork_teams` and `__kmpc_fork_call`, generated for +the declarations of `__kmpc_fork_teams` and `__kmpc_fork_call`, generated for `#pragma omp target teams` and `#pragma omp parallel`, respectively, are also automatically recognized as broker functions. Further functions might be added in the future. 
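For reference, the annotation discussed above looks roughly like this (signature abbreviated; the attribute arguments name the callback callee and the payload forwarded to it):

  __attribute__((callback(start_routine, arg)))
  int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                     void *(*start_routine)(void *), void *arg);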
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index c8c1d73cb5d9e..a4f8300aa38e1 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -984,6 +984,7 @@ LIBBUILTIN(longjmp, "vJi", "fr", "setjmp.h", ALL_LANGUAGES) LIBBUILTIN(alloca, "v*z", "f", "stdlib.h", ALL_GNU_LANGUAGES) // POSIX string.h LIBBUILTIN(memccpy, "v*v*vC*iz", "f", "string.h", ALL_GNU_LANGUAGES) +LIBBUILTIN(mempcpy, "v*v*vC*z", "f", "string.h", ALL_GNU_LANGUAGES) LIBBUILTIN(stpcpy, "c*c*cC*", "f", "string.h", ALL_GNU_LANGUAGES) LIBBUILTIN(stpncpy, "c*c*cC*z", "f", "string.h", ALL_GNU_LANGUAGES) LIBBUILTIN(strdup, "c*cC*", "f", "string.h", ALL_GNU_LANGUAGES) diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 0a8484d983f33..0d9d31670157d 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -181,6 +181,13 @@ def err_function_declared_typedef : Error< def err_at_defs_cxx : Error<"@defs is not supported in Objective-C++">; def err_at_in_class : Error<"unexpected '@' in member specification">; def err_unexpected_semi : Error<"unexpected ';' before %0">; +def err_unparenthesized_non_primary_expr_in_requires_clause : Error< + "parentheses are required around this expression in a requires clause">; +def note_unparenthesized_non_primary_expr_in_requires_clause : Note< + "parentheses are required around this expression in a requires clause">; +def err_potential_function_call_in_constraint_logical_or : Error< + "function call must be parenthesized to be considered part of the requires " + "clause">; def err_expected_fn_body : Error< "expected function body after function declarator">; @@ -309,6 +316,12 @@ def err_init_list_bin_op : Error<"initializer list cannot be used on the " def warn_cxx98_compat_trailing_return_type : Warning< "trailing return types are incompatible with C++98">, InGroup, DefaultIgnore; +def err_requires_clause_must_appear_after_trailing_return : Error< + "trailing return type must appear before trailing requires clause">; +def err_requires_clause_on_declarator_not_declaring_a_function : Error< + "trailing requires clause can only be used when declaring a function">; +def err_requires_clause_inside_parens : Error< + "trailing requires clause should be placed outside parentheses">; def ext_auto_storage_class : ExtWarn< "'auto' storage class specifier is not permitted in C++11, and will not " "be supported in future releases">, InGroup>; @@ -880,7 +893,7 @@ def warn_cxx98_compat_lambda : Warning< InGroup, DefaultIgnore; def err_lambda_missing_parens : Error< "lambda requires '()' before %select{'mutable'|return type|" - "attribute specifier|'constexpr'|'consteval'}0">; + "attribute specifier|'constexpr'|'consteval'|'requires' clause}0">; def err_lambda_decl_specifier_repeated : Error< "%select{'mutable'|'constexpr'|'consteval'}0 cannot appear multiple times in a lambda declarator">; def err_lambda_capture_misplaced_ellipsis : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5fa7d23c567cc..f82cdadd9ff47 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2618,6 +2618,18 @@ def note_single_arg_concept_specialization_constraint_evaluated_to_false : Note< "%select{and |because }0%1 does not satisfy %2">; def 
note_atomic_constraint_evaluated_to_false_elaborated : Note< "%select{and |because }0'%1' (%2 %3 %4) evaluated to false">; +def err_constrained_virtual_method : Error< + "virtual function cannot have a requires clause">; +def err_trailing_requires_clause_on_deduction_guide : Error< + "deduction guide cannot have a requires clause">; +def err_reference_to_function_with_unsatisfied_constraints : Error< + "invalid reference to function %0: constraints not satisfied">; +def note_ambiguous_atomic_constraints : Note< + "similar constraint expressions not considered equivalent; constraint " + "expressions cannot be considered equivalent unless they originate from the " + "same concept">; +def note_ambiguous_atomic_constraints_similar_expression : Note< + "similar constraint expression here">; def err_template_different_requires_clause : Error< "requires clause differs in template redeclaration">; @@ -3952,6 +3964,9 @@ def note_ovl_candidate_disabled_by_extension : Note< def err_addrof_function_disabled_by_enable_if_attr : Error< "cannot take address of function %0 because it has one or more " "non-tautological enable_if conditions">; +def err_addrof_function_constraints_not_satisfied : Error< + "cannot take address of function %0 because its constraints are not " + "satisfied">; def note_addrof_ovl_candidate_disabled_by_enable_if_attr : Note< "candidate function made ineligible by enable_if">; def note_ovl_candidate_deduced_mismatch : Note< @@ -4065,6 +4080,9 @@ def note_ovl_candidate_bad_target : Note< "call to " "%select{__device__|__global__|__host__|__host__ __device__|invalid}3 function from" " %select{__device__|__global__|__host__|__host__ __device__|invalid}4 function">; +def note_ovl_candidate_constraints_not_satisfied : Note< + "candidate %sub{select_ovl_candidate_kind}0,1,2 not viable: constraints " + "not satisfied">; def note_implicit_member_target_infer_collision : Note< "implicit %sub{select_special_member_kind}0 inferred target collision: call to both " "%select{__device__|__global__|__host__|__host__ __device__}1 and " diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td index 87091a3250715..86a04e33ce760 100644 --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -684,7 +684,7 @@ let params = [s16, s32], pnt = PNT_NType in { defm vqrshrun : VSHRN; } let params = T.Int, pnt = PNT_NType in { - defm vsli : DyadicImmShift; + defm vsli : DyadicImmShift; defm vsri : DyadicImmShift; } diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index af5b281636fe9..ec7641ffbc52c 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1679,6 +1679,9 @@ class Parser : public CodeCompletionHandler { ExprResult ParseConstantExpression(TypeCastState isTypeCast = NotTypeCast); ExprResult ParseCaseExpression(SourceLocation CaseLoc); ExprResult ParseConstraintExpression(); + ExprResult + ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause); + ExprResult ParseConstraintLogicalOrExpression(bool IsTrailingRequiresClause); // Expr that doesn't include commas. ExprResult ParseAssignmentExpression(TypeCastState isTypeCast = NotTypeCast); @@ -1693,15 +1696,23 @@ class Parser : public CodeCompletionHandler { ExprResult ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec); - ExprResult ParseCastExpression(bool isUnaryExpression, + /// Control what ParseCastExpression will parse. 
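Call sites migrate from the old boolean to the enum, e.g. (sketch; the old form shown for contrast):

  // before: ExprResult Res = ParseCastExpression(/*isUnaryExpression=*/false);
  ExprResult Res = ParseCastExpression(AnyCastExpr);
  // PrimaryExprOnly lets the requires-clause parser accept only
  // primary-expressions as atomic constraints, matching the new
  // parenthesization diagnostics added above.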
+ enum CastParseKind { + AnyCastExpr = 0, + UnaryExprOnly, + PrimaryExprOnly + }; + ExprResult ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand, bool &NotCastExpr, TypeCastState isTypeCast, - bool isVectorLiteral = false); - ExprResult ParseCastExpression(bool isUnaryExpression, + bool isVectorLiteral = false, + bool *NotPrimaryExpression = nullptr); + ExprResult ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand = false, TypeCastState isTypeCast = NotTypeCast, - bool isVectorLiteral = false); + bool isVectorLiteral = false, + bool *NotPrimaryExpression = nullptr); /// Returns true if the next token cannot start an expression. bool isNotExpressionStart(); @@ -1910,6 +1921,11 @@ class Parser : public CodeCompletionHandler { ExprResult ParseCoyieldExpression(); + //===--------------------------------------------------------------------===// + // C++ Concepts + + void ParseTrailingRequiresClause(Declarator &D); + //===--------------------------------------------------------------------===// // C99 6.7.8: Initialization. @@ -2739,6 +2755,9 @@ class Parser : public CodeCompletionHandler { BalancedDelimiterTracker &Tracker, bool IsAmbiguous, bool RequiresArg = false); + void InitCXXThisScopeForDeclaratorIfRelevant( + const Declarator &D, const DeclSpec &DS, + llvm::Optional &ThisScope); bool ParseRefQualifier(bool &RefQualifierIsLValueRef, SourceLocation &RefQualifierLoc); bool isFunctionDeclaratorIdentifierList(); @@ -2846,10 +2865,11 @@ class Parser : public CodeCompletionHandler { Decl *TagDecl); ExprResult ParseCXXMemberInitializer(Decl *D, bool IsFunction, SourceLocation &EqualLoc); - bool ParseCXXMemberDeclaratorBeforeInitializer(Declarator &DeclaratorInfo, - VirtSpecifiers &VS, - ExprResult &BitfieldSize, - LateParsedAttrList &LateAttrs); + bool + ParseCXXMemberDeclaratorBeforeInitializer(Declarator &DeclaratorInfo, + VirtSpecifiers &VS, + ExprResult &BitfieldSize, + LateParsedAttrList &LateAttrs); void MaybeParseAndDiagnoseDeclSpecAfterCXX11VirtSpecifierSeq(Declarator &D, VirtSpecifiers &VS); DeclGroupPtrTy ParseCXXClassMemberDeclaration( diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index e3ead60bb43f6..aceec9cbe1c9e 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -1826,6 +1826,10 @@ class Declarator { /// The asm label, if specified. Expr *AsmLabel; + /// \brief The constraint-expression specified by the trailing + /// requires-clause, or null if no such clause was specified. + Expr *TrailingRequiresClause; + #ifndef _MSC_VER union { #endif @@ -1855,7 +1859,8 @@ class Declarator { GroupingParens(false), FunctionDefinition(FDK_Declaration), Redeclaration(false), Extension(false), ObjCIvar(false), ObjCWeakProperty(false), InlineStorageUsed(false), - Attrs(ds.getAttributePool().getFactory()), AsmLabel(nullptr) {} + Attrs(ds.getAttributePool().getFactory()), AsmLabel(nullptr), + TrailingRequiresClause(nullptr) {} ~Declarator() { clear(); @@ -2401,6 +2406,22 @@ class Declarator { return false; } + /// \brief Sets a trailing requires clause for this declarator. + void setTrailingRequiresClause(Expr *TRC) { + TrailingRequiresClause = TRC; + } + + /// \brief Sets a trailing requires clause for this declarator. + Expr *getTrailingRequiresClause() { + return TrailingRequiresClause; + } + + /// \brief Determine whether a trailing requires clause was written in this + /// declarator. 
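For illustration, a Sema-side consumer might guard on it like this (hypothetical call site, using a diagnostic added in this patch):

  if (D.hasTrailingRequiresClause() && !D.isFunctionDeclarator())
    Diag(D.getTrailingRequiresClause()->getBeginLoc(),
         diag::err_requires_clause_on_declarator_not_declaring_a_function);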
+ bool hasTrailingRequiresClause() const { + return TrailingRequiresClause != nullptr; + } + /// takeAttributes - Takes attributes from the given parsed-attributes /// set and add them to this declarator. /// diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index e0c3ba13ef543..0ccb658c6a771 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -754,7 +754,11 @@ class Sema; /// This constructor/conversion candidate fail due to an address space /// mismatch between the object being constructed and the overload /// candidate. - ovl_fail_object_addrspace_mismatch + ovl_fail_object_addrspace_mismatch, + + /// This candidate was not viable because its associated constraints were + /// not satisfied. + ovl_fail_constraints_not_satisfied, }; /// A list of implicit conversion sequences for the arguments of an diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index e712745c963fb..cb436d8c9ed23 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -47,6 +47,7 @@ #include "clang/Sema/ObjCMethodList.h" #include "clang/Sema/Ownership.h" #include "clang/Sema/Scope.h" +#include "clang/Sema/SemaConcept.h" #include "clang/Sema/TypoCorrection.h" #include "clang/Sema/Weak.h" #include "llvm/ADT/ArrayRef.h" @@ -2428,6 +2429,8 @@ class Sema final { SkipBodyInfo *SkipBody = nullptr); Decl *ActOnStartOfFunctionDef(Scope *S, Decl *D, SkipBodyInfo *SkipBody = nullptr); + void ActOnStartTrailingRequiresClause(Scope *S, Declarator &D); + ExprResult ActOnFinishTrailingRequiresClause(ExprResult ConstraintExpr); void ActOnStartOfObjCMethodDef(Scope *S, Decl *D); bool isObjCMethodDecl(Decl *D) { return D && isa(D); @@ -3012,7 +3015,8 @@ class Sema final { NamedDecl *&OldDecl, bool IsForUsingDecl); bool IsOverload(FunctionDecl *New, FunctionDecl *Old, bool IsForUsingDecl, - bool ConsiderCudaAttrs = true); + bool ConsiderCudaAttrs = true, + bool ConsiderRequiresClauses = true); ImplicitConversionSequence TryImplicitConversion(Expr *From, QualType ToType, @@ -3377,10 +3381,9 @@ class Sema final { bool *pHadMultipleCandidates = nullptr); FunctionDecl * - resolveAddressOfOnlyViableOverloadCandidate(Expr *E, - DeclAccessPair &FoundResult); + resolveAddressOfSingleOverloadCandidate(Expr *E, DeclAccessPair &FoundResult); - bool resolveAndFixAddressOfOnlyViableOverloadCandidate( + bool resolveAndFixAddressOfSingleOverloadCandidate( ExprResult &SrcExpr, bool DoFunctionPointerConversion = false); FunctionDecl * @@ -6172,7 +6175,8 @@ class Sema final { TypeSourceInfo *MethodType, SourceLocation EndLoc, ArrayRef Params, - ConstexprSpecKind ConstexprKind); + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause); /// Number lambda for linkage purposes if necessary. void handleLambdaNumbering( @@ -6306,16 +6310,35 @@ class Sema final { Expr *Src); /// Check whether the given expression is a valid constraint expression. - /// A diagnostic is emitted if it is not, and false is returned. - bool CheckConstraintExpression(Expr *CE); + /// A diagnostic is emitted if it is not, false is returned, and + /// PossibleNonPrimary will be set to true if the failure might be due to a + /// non-primary expression being used as an atomic constraint. 
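A sketch of the expected parser-side use of the new out-parameter (the call shape is assumed, not taken from the patch):

  bool PossibleNonPrimary = false;
  if (!Actions.CheckConstraintExpression(E, Tok, &PossibleNonPrimary,
                                         /*IsTrailingRequiresClause=*/true) &&
      PossibleNonPrimary)
    Diag(E->getBeginLoc(),
         diag::err_unparenthesized_non_primary_expr_in_requires_clause);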
+ bool CheckConstraintExpression(Expr *CE, Token NextToken = Token(), + bool *PossibleNonPrimary = nullptr, + bool IsTrailingRequiresClause = false); + + /// Check whether the given type-dependent expression will be the name of a + /// function or another callable function-like entity (e.g. a function + // template or overload set) for any substitution. + bool IsDependentFunctionNameExpr(Expr *E); private: - /// \brief Caches pairs of template-like decls whose associated constraints - /// were checked for subsumption and whether or not the first's constraints - /// did in fact subsume the second's. + /// Caches pairs of template-like decls whose associated constraints were + /// checked for subsumption and whether or not the first's constraints did in + /// fact subsume the second's. llvm::DenseMap, bool> SubsumptionCache; + /// Caches the normalized associated constraints of declarations (concepts or + /// constrained declarations). If an error occurred while normalizing the + /// associated constraints of the template or concept, nullptr will be cached + /// here. + llvm::DenseMap + NormalizationCache; public: + const NormalizedConstraint * + getNormalizedAssociatedConstraints( + NamedDecl *ConstrainedDecl, ArrayRef AssociatedConstraints); + /// \brief Check whether the given declaration's associated constraints are /// at least as constrained than another declaration's according to the /// partial ordering of constraints. @@ -6328,6 +6351,13 @@ class Sema final { NamedDecl *D2, ArrayRef AC2, bool &Result); + /// If D1 was not at least as constrained as D2, but would've been if a pair + /// of atomic constraints involved had been declared in a concept and not + /// repeated in two separate places in code. + /// \returns true if such a diagnostic was emitted, false otherwise. + bool MaybeEmitAmbiguousAtomicConstraintsDiagnostic(NamedDecl *D1, + ArrayRef AC1, NamedDecl *D2, ArrayRef AC2); + /// \brief Check whether the given list of constraint expressions are /// satisfied (as if in a 'conjunction') given template arguments. /// \param ConstraintExprs a list of constraint expressions, treated as if @@ -8781,6 +8811,10 @@ class Sema final { void InstantiateExceptionSpec(SourceLocation PointOfInstantiation, FunctionDecl *Function); + bool CheckInstantiatedFunctionTemplateConstraints( + SourceLocation PointOfInstantiation, FunctionDecl *Decl, + ArrayRef TemplateArgs, + ConstraintSatisfaction &Satisfaction); FunctionDecl *InstantiateFunctionDeclaration(FunctionTemplateDecl *FTD, const TemplateArgumentList *Args, SourceLocation Loc); diff --git a/clang/include/clang/Sema/SemaConcept.h b/clang/include/clang/Sema/SemaConcept.h new file mode 100644 index 0000000000000..acd1e604211a1 --- /dev/null +++ b/clang/include/clang/Sema/SemaConcept.h @@ -0,0 +1,145 @@ +//===-- SemaConcept.h - Semantic Analysis for Constraints and Concepts ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +// This file provides semantic analysis for C++ constraints and concepts. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_SEMA_SEMACONCEPT_H +#define LLVM_CLANG_SEMA_SEMACONCEPT_H +#include "clang/AST/ASTContext.h" +#include "clang/AST/Expr.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +namespace clang { +class Sema; + +struct AtomicConstraint { + const Expr *ConstraintExpr; + Optional> ParameterMapping; + + AtomicConstraint(Sema &S, const Expr *ConstraintExpr) : + ConstraintExpr(ConstraintExpr) { }; + + bool hasMatchingParameterMapping(ASTContext &C, + const AtomicConstraint &Other) const { + if (!ParameterMapping != !Other.ParameterMapping) + return false; + if (!ParameterMapping) + return true; + if (ParameterMapping->size() != Other.ParameterMapping->size()) + return false; + + for (unsigned I = 0, S = ParameterMapping->size(); I < S; ++I) + if (!C.getCanonicalTemplateArgument((*ParameterMapping)[I].getArgument()) + .structurallyEquals(C.getCanonicalTemplateArgument( + (*Other.ParameterMapping)[I].getArgument()))) + return false; + return true; + } + + bool subsumes(ASTContext &C, const AtomicConstraint &Other) const { + // C++ [temp.constr.order] p2 + // - an atomic constraint A subsumes another atomic constraint B + // if and only if the A and B are identical [...] + // + // C++ [temp.constr.atomic] p2 + // Two atomic constraints are identical if they are formed from the + // same expression and the targets of the parameter mappings are + // equivalent according to the rules for expressions [...] + + // We do not actually substitute the parameter mappings into the + // constraint expressions, therefore the constraint expressions are + // the originals, and comparing them will suffice. + if (ConstraintExpr != Other.ConstraintExpr) + return false; + + // Check that the parameter lists are identical + return hasMatchingParameterMapping(C, Other); + } +}; + +/// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is +/// either an atomic constraint, a conjunction of normalized constraints or a +/// disjunction of normalized constraints. 
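For example, per C++ [temp.constr.normal] (concept names illustrative):

  // requires C1<T> && (C2<T> || C3<T>)   normalizes to
  //   Conjunction(Atomic(C1<T>), Disjunction(Atomic(C2<T>), Atomic(C3<T>)))
  // and subsumption ([temp.constr.order]) is decided over this tree.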
+struct NormalizedConstraint { + friend class Sema; + + enum CompoundConstraintKind { CCK_Conjunction, CCK_Disjunction }; + + using CompoundConstraint = llvm::PointerIntPair< + std::pair *, 1, + CompoundConstraintKind>; + + llvm::PointerUnion Constraint; + + NormalizedConstraint(AtomicConstraint *C): Constraint{C} { }; + NormalizedConstraint(ASTContext &C, NormalizedConstraint LHS, + NormalizedConstraint RHS, CompoundConstraintKind Kind) + : Constraint{CompoundConstraint{ + new (C) std::pair{ + std::move(LHS), std::move(RHS)}, Kind}} { }; + + NormalizedConstraint(ASTContext &C, const NormalizedConstraint &Other) { + if (Other.isAtomic()) { + Constraint = new (C) AtomicConstraint(*Other.getAtomicConstraint()); + } else { + Constraint = CompoundConstraint( + new (C) std::pair{ + NormalizedConstraint(C, Other.getLHS()), + NormalizedConstraint(C, Other.getRHS())}, + Other.getCompoundKind()); + } + } + NormalizedConstraint(NormalizedConstraint &&Other): + Constraint(Other.Constraint) { + Other.Constraint = nullptr; + } + NormalizedConstraint &operator=(const NormalizedConstraint &Other) = delete; + NormalizedConstraint &operator=(NormalizedConstraint &&Other) { + if (&Other != this) { + NormalizedConstraint Temp(std::move(Other)); + std::swap(Constraint, Temp.Constraint); + } + return *this; + } + + CompoundConstraintKind getCompoundKind() const { + assert(!isAtomic() && "getCompoundKind called on atomic constraint."); + return Constraint.get().getInt(); + } + + bool isAtomic() const { return Constraint.is(); } + + NormalizedConstraint &getLHS() const { + assert(!isAtomic() && "getLHS called on atomic constraint."); + return Constraint.get().getPointer()->first; + } + + NormalizedConstraint &getRHS() const { + assert(!isAtomic() && "getRHS called on atomic constraint."); + return Constraint.get().getPointer()->second; + } + + AtomicConstraint *getAtomicConstraint() const { + assert(isAtomic() && + "getAtomicConstraint called on non-atomic constraint."); + return Constraint.get(); + } + +private: + static Optional + fromConstraintExprs(Sema &S, NamedDecl *D, ArrayRef E); + static Optional + fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E); +}; + +} // clang + +#endif //LLVM_CLANG_SEMA_SEMACONCEPT_H diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 567d2bf7d228d..f6c3aa1a3c1d2 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -3279,10 +3279,12 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { TypeSourceInfo *TInfo; SourceLocation ToInnerLocStart, ToEndLoc; NestedNameSpecifierLoc ToQualifierLoc; + Expr *TrailingRequiresClause; if (auto Imp = importSeq( FromTy, D->getTypeSourceInfo(), D->getInnerLocStart(), - D->getQualifierLoc(), D->getEndLoc())) - std::tie(T, TInfo, ToInnerLocStart, ToQualifierLoc, ToEndLoc) = *Imp; + D->getQualifierLoc(), D->getEndLoc(), D->getTrailingRequiresClause())) + std::tie(T, TInfo, ToInnerLocStart, ToQualifierLoc, ToEndLoc, + TrailingRequiresClause) = *Imp; else return Imp.takeError(); @@ -3311,7 +3313,10 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { ExplicitSpecifier( ExplicitExpr, FromConstructor->getExplicitSpecifier().getKind()), - D->isInlineSpecified(), D->isImplicit(), D->getConstexprKind())) + D->isInlineSpecified(), D->isImplicit(), D->getConstexprKind(), + InheritedConstructor(), // FIXME: Properly import inherited + // constructor info + TrailingRequiresClause)) return ToFunction; } else if (CXXDestructorDecl *FromDtor = dyn_cast(D)) { @@ -3329,7 
+3334,7 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { if (GetImportedOrCreateDecl( ToFunction, D, Importer.getToContext(), cast(DC), ToInnerLocStart, NameInfo, T, TInfo, D->isInlineSpecified(), - D->isImplicit(), D->getConstexprKind())) + D->isImplicit(), D->getConstexprKind(), TrailingRequiresClause)) return ToFunction; CXXDestructorDecl *ToDtor = cast(ToFunction); @@ -3349,20 +3354,21 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { ToInnerLocStart, NameInfo, T, TInfo, D->isInlineSpecified(), ExplicitSpecifier(ExplicitExpr, FromConversion->getExplicitSpecifier().getKind()), - D->getConstexprKind(), SourceLocation())) + D->getConstexprKind(), SourceLocation(), TrailingRequiresClause)) return ToFunction; } else if (auto *Method = dyn_cast(D)) { if (GetImportedOrCreateDecl( ToFunction, D, Importer.getToContext(), cast(DC), ToInnerLocStart, NameInfo, T, TInfo, Method->getStorageClass(), Method->isInlineSpecified(), D->getConstexprKind(), - SourceLocation())) + SourceLocation(), TrailingRequiresClause)) return ToFunction; } else { if (GetImportedOrCreateDecl( ToFunction, D, Importer.getToContext(), DC, ToInnerLocStart, NameInfo, T, TInfo, D->getStorageClass(), D->isInlineSpecified(), - D->hasWrittenPrototype(), D->getConstexprKind())) + D->hasWrittenPrototype(), D->getConstexprKind(), + TrailingRequiresClause)) return ToFunction; } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 6cfd4c2a2a218..be59d88b73f13 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -1839,23 +1839,27 @@ void DeclaratorDecl::setQualifierInfo(NestedNameSpecifierLoc QualifierLoc) { } // Set qualifier info. getExtInfo()->QualifierLoc = QualifierLoc; - } else { + } else if (hasExtInfo()) { // Here Qualifier == 0, i.e., we are removing the qualifier (if any). - if (hasExtInfo()) { - if (getExtInfo()->NumTemplParamLists == 0) { - // Save type source info pointer. - TypeSourceInfo *savedTInfo = getExtInfo()->TInfo; - // Deallocate the extended decl info. - getASTContext().Deallocate(getExtInfo()); - // Restore savedTInfo into (non-extended) decl info. - DeclInfo = savedTInfo; - } - else - getExtInfo()->QualifierLoc = QualifierLoc; - } + getExtInfo()->QualifierLoc = QualifierLoc; } } +void DeclaratorDecl::setTrailingRequiresClause(Expr *TrailingRequiresClause) { + assert(TrailingRequiresClause); + // Make sure the extended decl info is allocated. + if (!hasExtInfo()) { + // Save (non-extended) type source info pointer. + auto *savedTInfo = DeclInfo.get(); + // Allocate external info struct. + DeclInfo = new (getASTContext()) ExtInfo; + // Restore savedTInfo into (extended) decl info. + getExtInfo()->TInfo = savedTInfo; + } + // Set requires clause info. 
+ getExtInfo()->TrailingRequiresClause = TrailingRequiresClause; +} + void DeclaratorDecl::setTemplateParameterListsInfo( ASTContext &Context, ArrayRef TPLists) { assert(!TPLists.empty()); @@ -2777,7 +2781,8 @@ FunctionDecl::FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass S, bool isInlineSpecified, - ConstexprSpecKind ConstexprKind) + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause) : DeclaratorDecl(DK, DC, NameInfo.getLoc(), NameInfo.getName(), T, TInfo, StartLoc), DeclContext(DK), redeclarable_base(C), Body(), ODRHash(0), @@ -2807,6 +2812,8 @@ FunctionDecl::FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC, FunctionDeclBits.IsMultiVersion = false; FunctionDeclBits.IsCopyDeductionCandidate = false; FunctionDeclBits.HasODRHash = false; + if (TrailingRequiresClause) + setTrailingRequiresClause(TrailingRequiresClause); } void FunctionDecl::getNameForDiagnostic( @@ -3872,6 +3879,11 @@ unsigned FunctionDecl::getMemoryFunctionKind() const { case Builtin::BImemcpy: return Builtin::BImemcpy; + case Builtin::BI__builtin_mempcpy: + case Builtin::BI__builtin___mempcpy_chk: + case Builtin::BImempcpy: + return Builtin::BImempcpy; + case Builtin::BI__builtin_memmove: case Builtin::BI__builtin___memmove_chk: case Builtin::BImemmove: @@ -3929,6 +3941,8 @@ unsigned FunctionDecl::getMemoryFunctionKind() const { return Builtin::BImemset; else if (FnInfo->isStr("memcpy")) return Builtin::BImemcpy; + else if (FnInfo->isStr("mempcpy")) + return Builtin::BImempcpy; else if (FnInfo->isStr("memmove")) return Builtin::BImemmove; else if (FnInfo->isStr("memcmp")) @@ -4683,10 +4697,12 @@ FunctionDecl *FunctionDecl::Create(ASTContext &C, DeclContext *DC, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInlineSpecified, bool hasWrittenPrototype, - ConstexprSpecKind ConstexprKind) { + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause) { FunctionDecl *New = new (C, DC) FunctionDecl(Function, C, DC, StartLoc, NameInfo, T, TInfo, - SC, isInlineSpecified, ConstexprKind); + SC, isInlineSpecified, ConstexprKind, + TrailingRequiresClause); New->setHasWrittenPrototype(hasWrittenPrototype); return New; } @@ -4694,7 +4710,7 @@ FunctionDecl *FunctionDecl::Create(ASTContext &C, DeclContext *DC, FunctionDecl *FunctionDecl::CreateDeserialized(ASTContext &C, unsigned ID) { return new (C, ID) FunctionDecl(Function, C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, - SC_None, false, CSK_unspecified); + SC_None, false, CSK_unspecified, nullptr); } BlockDecl *BlockDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L) { diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index caa60408b5b67..bc75c4e544d28 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -2041,16 +2041,19 @@ CXXMethodDecl *CXXMethodDecl::Create(ASTContext &C, CXXRecordDecl *RD, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInline, ConstexprSpecKind ConstexprKind, - SourceLocation EndLocation) { + SourceLocation EndLocation, + Expr *TrailingRequiresClause) { return new (C, RD) CXXMethodDecl(CXXMethod, C, RD, StartLoc, NameInfo, T, TInfo, SC, - isInline, ConstexprKind, EndLocation); + isInline, ConstexprKind, EndLocation, + TrailingRequiresClause); } CXXMethodDecl *CXXMethodDecl::CreateDeserialized(ASTContext &C, unsigned ID) { return new (C, ID) CXXMethodDecl( CXXMethod, C, nullptr, SourceLocation(), DeclarationNameInfo(), - QualType(), nullptr, SC_None, 
false, CSK_unspecified, SourceLocation()); + QualType(), nullptr, SC_None, false, CSK_unspecified, SourceLocation(), + nullptr); } CXXMethodDecl *CXXMethodDecl::getDevirtualizedMethod(const Expr *Base, @@ -2431,9 +2434,11 @@ CXXConstructorDecl::CXXConstructorDecl( ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, ExplicitSpecifier ES, bool isInline, bool isImplicitlyDeclared, - ConstexprSpecKind ConstexprKind, InheritedConstructor Inherited) + ConstexprSpecKind ConstexprKind, InheritedConstructor Inherited, + Expr *TrailingRequiresClause) : CXXMethodDecl(CXXConstructor, C, RD, StartLoc, NameInfo, T, TInfo, - SC_None, isInline, ConstexprKind, SourceLocation()) { + SC_None, isInline, ConstexprKind, SourceLocation(), + TrailingRequiresClause) { setNumCtorInitializers(0); setInheritingConstructor(static_cast(Inherited)); setImplicit(isImplicitlyDeclared); @@ -2457,7 +2462,7 @@ CXXConstructorDecl *CXXConstructorDecl::CreateDeserialized(ASTContext &C, auto *Result = new (C, ID, Extra) CXXConstructorDecl(C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, ExplicitSpecifier(), false, false, - CSK_unspecified, InheritedConstructor()); + CSK_unspecified, InheritedConstructor(), nullptr); Result->setInheritingConstructor(isInheritingConstructor); Result->CXXConstructorDeclBits.HasTrailingExplicitSpecifier = hasTraillingExplicit; @@ -2469,7 +2474,8 @@ CXXConstructorDecl *CXXConstructorDecl::Create( ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, ExplicitSpecifier ES, bool isInline, bool isImplicitlyDeclared, - ConstexprSpecKind ConstexprKind, InheritedConstructor Inherited) { + ConstexprSpecKind ConstexprKind, InheritedConstructor Inherited, + Expr *TrailingRequiresClause) { assert(NameInfo.getName().getNameKind() == DeclarationName::CXXConstructorName && "Name must refer to a constructor"); @@ -2478,7 +2484,8 @@ CXXConstructorDecl *CXXConstructorDecl::Create( Inherited ? 1 : 0, ES.getExpr() ? 
1 : 0); return new (C, RD, Extra) CXXConstructorDecl(C, RD, StartLoc, NameInfo, T, TInfo, ES, isInline, - isImplicitlyDeclared, ConstexprKind, Inherited); + isImplicitlyDeclared, ConstexprKind, Inherited, + TrailingRequiresClause); } CXXConstructorDecl::init_const_iterator CXXConstructorDecl::init_begin() const { @@ -2599,19 +2606,22 @@ CXXDestructorDecl * CXXDestructorDecl::CreateDeserialized(ASTContext &C, unsigned ID) { return new (C, ID) CXXDestructorDecl(C, nullptr, SourceLocation(), DeclarationNameInfo(), - QualType(), nullptr, false, false, CSK_unspecified); + QualType(), nullptr, false, false, CSK_unspecified, + nullptr); } CXXDestructorDecl *CXXDestructorDecl::Create( ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, - bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind) { + bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause) { assert(NameInfo.getName().getNameKind() == DeclarationName::CXXDestructorName && "Name must refer to a destructor"); return new (C, RD) CXXDestructorDecl(C, RD, StartLoc, NameInfo, T, TInfo, isInline, - isImplicitlyDeclared, ConstexprKind); + isImplicitlyDeclared, ConstexprKind, + TrailingRequiresClause); } void CXXDestructorDecl::setOperatorDelete(FunctionDecl *OD, Expr *ThisArg) { @@ -2630,20 +2640,20 @@ CXXConversionDecl * CXXConversionDecl::CreateDeserialized(ASTContext &C, unsigned ID) { return new (C, ID) CXXConversionDecl( C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, - false, ExplicitSpecifier(), CSK_unspecified, SourceLocation()); + false, ExplicitSpecifier(), CSK_unspecified, SourceLocation(), nullptr); } CXXConversionDecl *CXXConversionDecl::Create( ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool isInline, ExplicitSpecifier ES, ConstexprSpecKind ConstexprKind, - SourceLocation EndLocation) { + SourceLocation EndLocation, Expr *TrailingRequiresClause) { assert(NameInfo.getName().getNameKind() == DeclarationName::CXXConversionFunctionName && "Name must refer to a conversion function"); return new (C, RD) CXXConversionDecl(C, RD, StartLoc, NameInfo, T, TInfo, isInline, ES, - ConstexprKind, EndLocation); + ConstexprKind, EndLocation, TrailingRequiresClause); } bool CXXConversionDecl::isLambdaToBlockPointerConversion() const { diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp index 0702256d7a2be..50aab3c080954 100644 --- a/clang/lib/AST/DeclPrinter.cpp +++ b/clang/lib/AST/DeclPrinter.cpp @@ -746,6 +746,11 @@ void DeclPrinter::VisitFunctionDecl(FunctionDecl *D) { Proto.clear(); } Out << Proto; + + if (Expr *TrailingRequiresClause = D->getTrailingRequiresClause()) { + Out << " requires "; + TrailingRequiresClause->printPretty(Out, nullptr, SubPolicy, Indentation); + } } else { Ty.print(Out, Policy, Proto); } diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 23734396b7694..59fa7faad927d 100755 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -171,13 +171,18 @@ void TemplateDecl::anchor() {} void TemplateDecl:: getAssociatedConstraints(llvm::SmallVectorImpl &AC) const { - // TODO: Concepts: Append function trailing requires clause. 
TemplateParams->getAssociatedConstraints(AC); + if (auto *FD = dyn_cast_or_null(getTemplatedDecl())) + if (const Expr *TRC = FD->getTrailingRequiresClause()) + AC.push_back(TRC); } bool TemplateDecl::hasAssociatedConstraints() const { - // TODO: Concepts: Regard function trailing requires clause. - return TemplateParams->hasAssociatedConstraints(); + if (TemplateParams->hasAssociatedConstraints()) + return true; + if (auto *FD = dyn_cast_or_null(getTemplatedDecl())) + return FD->getTrailingRequiresClause(); + return false; } //===----------------------------------------------------------------------===// diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index 1877d4a5ef70b..bc0ffb7fa440a 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -316,7 +316,8 @@ bool PPCTargetInfo::initFeatureMap( .Case("pwr8", true) .Default(false); - Features["spe"] = llvm::StringSwitch(CPU) + Features["spe"] = getTriple().getSubArch() == llvm::Triple::PPCSubArch_spe || + llvm::StringSwitch(CPU) .Case("8548", true) .Case("e500", true) .Default(false); diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 3076025fc5b2b..270aa7ff91815 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -87,8 +87,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { // Note: GCC recognizes the following additional cpus: // 401, 403, 405, 405fp, 440fp, 464, 464fp, 476, 476fp, 505, 740, 801, - // 821, 823, 8540, 8548, e300c2, e300c3, e500mc64, e6500, 860, cell, - // titan, rs64. + // 821, 823, 8540, e300c2, e300c3, e500mc64, e6500, 860, cell, titan, rs64. bool isValidCPUName(StringRef Name) const override; void fillValidCPUList(SmallVectorImpl &Values) const override; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index b73cf1c6d35eb..e2c5f1b42f854 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2500,7 +2500,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(nullptr); } case Builtin::BImemcpy: - case Builtin::BI__builtin_memcpy: { + case Builtin::BI__builtin_memcpy: + case Builtin::BImempcpy: + case Builtin::BI__builtin_mempcpy: { Address Dest = EmitPointerWithAlignment(E->getArg(0)); Address Src = EmitPointerWithAlignment(E->getArg(1)); Value *SizeVal = EmitScalarExpr(E->getArg(2)); @@ -2509,7 +2511,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(), E->getArg(1)->getExprLoc(), FD, 1); Builder.CreateMemCpy(Dest, Src, SizeVal, false); - return RValue::get(Dest.getPointer()); + if (BuiltinID == Builtin::BImempcpy || + BuiltinID == Builtin::BI__builtin_mempcpy) + return RValue::get(Builder.CreateInBoundsGEP(Dest.getPointer(), SizeVal)); + else + return RValue::get(Dest.getPointer()); } case Builtin::BI__builtin_char_memchr: @@ -4330,9 +4336,29 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(V); } - // See if we have a target specific builtin that needs to be lowered. - if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E, ReturnValue)) - return RValue::get(V); + // Some target-specific builtins can have aggregate return values, e.g. + // __builtin_arm_mve_vld2q_u32. So if the result is an aggregate, force + // ReturnValue to be non-null, so that the target-specific emission code can + // always just emit into it. 
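+  // (Illustrative only: in ACLE-style source this situation arises from,
+  // e.g., `uint32x4x2_t Pair = vld2q_u32(Ptr);`, where the intrinsic lowers
+  // to __builtin_arm_mve_vld2q_u32 and the struct-of-two-vectors result is
+  // TEK_Aggregate rather than TEK_Scalar.)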
+ TypeEvaluationKind EvalKind = getEvaluationKind(E->getType()); + if (EvalKind == TEK_Aggregate && ReturnValue.isNull()) { + Address DestPtr = CreateMemTemp(E->getType(), "agg.tmp"); + ReturnValue = ReturnValueSlot(DestPtr, false); + } + + // Now see if we can emit a target-specific builtin. + if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E, ReturnValue)) { + switch (EvalKind) { + case TEK_Scalar: + return RValue::get(V); + case TEK_Aggregate: + return RValue::getAggregate(ReturnValue.getValue(), + ReturnValue.isVolatile()); + case TEK_Complex: + llvm_unreachable("No current target builtin returns complex"); + } + llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr"); + } ErrorUnsupported(E, "builtin function"); diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index 479cd8ec77cec..a27b6d4ed6374 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1236,6 +1236,7 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { // The first Interface we find may be a @class, // which should only be treated as the source of // truth in the absence of a true declaration. + assert(OID && "Failed to find ObjCInterfaceDecl"); const ObjCInterfaceDecl *OIDDef = OID->getDefinition(); if (OIDDef != nullptr) OID = OIDDef; @@ -3036,6 +3037,7 @@ llvm::Value *CGObjCGNU::GenerateProtocolRef(CodeGenFunction &CGF, llvm::Constant *&protocol = ExistingProtocols[PD->getNameAsString()]; if (!protocol) GenerateProtocol(PD); + assert(protocol && "Unknown protocol"); llvm::Type *T = CGM.getTypes().ConvertType(CGM.getContext().getObjCProtoType()); return CGF.Builder.CreateBitCast(protocol, llvm::PointerType::getUnqual(T)); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 11ba2cd9f11d9..a92aa37358d11 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -733,10 +733,6 @@ enum OpenMPRTLFunction { OMPRTL__tgt_target_teams_nowait, // Call to void __tgt_register_requires(int64_t flags); OMPRTL__tgt_register_requires, - // Call to void __tgt_register_lib(__tgt_bin_desc *desc); - OMPRTL__tgt_register_lib, - // Call to void __tgt_unregister_lib(__tgt_bin_desc *desc); - OMPRTL__tgt_unregister_lib, // Call to void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, // void** args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); OMPRTL__tgt_target_data_begin, @@ -2478,26 +2474,6 @@ llvm::FunctionCallee CGOpenMPRuntime::createRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_register_requires"); break; } - case OMPRTL__tgt_register_lib: { - // Build void __tgt_register_lib(__tgt_bin_desc *desc); - QualType ParamTy = - CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy()); - llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)}; - auto *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_register_lib"); - break; - } - case OMPRTL__tgt_unregister_lib: { - // Build void __tgt_unregister_lib(__tgt_bin_desc *desc); - QualType ParamTy = - CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy()); - llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)}; - auto *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_unregister_lib"); - break; - } case OMPRTL__tgt_target_data_begin: { // Build void __tgt_target_data_begin(int64_t device_id, int32_t 
arg_num, // void** args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); @@ -4378,57 +4354,6 @@ QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() { return TgtOffloadEntryQTy; } -QualType CGOpenMPRuntime::getTgtDeviceImageQTy() { - // These are the types we need to build: - // struct __tgt_device_image{ - // void *ImageStart; // Pointer to the target code start. - // void *ImageEnd; // Pointer to the target code end. - // // We also add the host entries to the device image, as it may be useful - // // for the target runtime to have access to that information. - // __tgt_offload_entry *EntriesBegin; // Begin of the table with all - // // the entries. - // __tgt_offload_entry *EntriesEnd; // End of the table with all the - // // entries (non inclusive). - // }; - if (TgtDeviceImageQTy.isNull()) { - ASTContext &C = CGM.getContext(); - RecordDecl *RD = C.buildImplicitRecord("__tgt_device_image"); - RD->startDefinition(); - addFieldToRecordDecl(C, RD, C.VoidPtrTy); - addFieldToRecordDecl(C, RD, C.VoidPtrTy); - addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); - addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); - RD->completeDefinition(); - TgtDeviceImageQTy = C.getRecordType(RD); - } - return TgtDeviceImageQTy; -} - -QualType CGOpenMPRuntime::getTgtBinaryDescriptorQTy() { - // struct __tgt_bin_desc{ - // int32_t NumDevices; // Number of devices supported. - // __tgt_device_image *DeviceImages; // Arrays of device images - // // (one per device). - // __tgt_offload_entry *EntriesBegin; // Begin of the table with all the - // // entries. - // __tgt_offload_entry *EntriesEnd; // End of the table with all the - // // entries (non inclusive). - // }; - if (TgtBinaryDescriptorQTy.isNull()) { - ASTContext &C = CGM.getContext(); - RecordDecl *RD = C.buildImplicitRecord("__tgt_bin_desc"); - RD->startDefinition(); - addFieldToRecordDecl( - C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true)); - addFieldToRecordDecl(C, RD, C.getPointerType(getTgtDeviceImageQTy())); - addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); - addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); - RD->completeDefinition(); - TgtBinaryDescriptorQTy = C.getRecordType(RD); - } - return TgtBinaryDescriptorQTy; -} - namespace { struct PrivateHelpersTy { PrivateHelpersTy(const VarDecl *Original, const VarDecl *PrivateCopy, @@ -11193,6 +11118,7 @@ bool checkContext( case llvm::Triple::wasm64: case llvm::Triple::renderscript32: case llvm::Triple::renderscript64: + case llvm::Triple::ve: return false; } } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index c40308ee74971..8159f5e8b790f 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -439,29 +439,10 @@ class CGOpenMPRuntime { /// // (function or global) /// char *name; // Name of the function or global. /// size_t size; // Size of the entry info (0 if it a function). + /// int32_t flags; + /// int32_t reserved; /// }; QualType TgtOffloadEntryQTy; - /// struct __tgt_device_image{ - /// void *ImageStart; // Pointer to the target code start. - /// void *ImageEnd; // Pointer to the target code end. - /// // We also add the host entries to the device image, as it may be useful - /// // for the target runtime to have access to that information. - /// __tgt_offload_entry *EntriesBegin; // Begin of the table with all - /// // the entries. 
-  ///   __tgt_offload_entry *EntriesEnd; // End of the table with all the
-  ///                                    // entries (non inclusive).
-  /// };
-  QualType TgtDeviceImageQTy;
-  /// struct __tgt_bin_desc{
-  ///   int32_t NumDevices; // Number of devices supported.
-  ///   __tgt_device_image *DeviceImages; // Arrays of device images
-  ///                                     // (one per device).
-  ///   __tgt_offload_entry *EntriesBegin; // Begin of the table with all the
-  ///                                      // entries.
-  ///   __tgt_offload_entry *EntriesEnd; // End of the table with all the
-  ///                                    // entries (non inclusive).
-  /// };
-  QualType TgtBinaryDescriptorQTy;
   /// Entity that registers the offloading constants that were emitted so
   /// far.
   class OffloadEntriesInfoManagerTy {
@@ -717,12 +698,6 @@ class CGOpenMPRuntime {
   /// Returns __tgt_offload_entry type.
   QualType getTgtOffloadEntryQTy();
-  /// Returns __tgt_device_image type.
-  QualType getTgtDeviceImageQTy();
-
-  /// Returns __tgt_bin_desc type.
-  QualType getTgtBinaryDescriptorQTy();
-
   /// Start scanning from statement \a S and emit all target regions
   /// found along the way.
   /// \param S Starting statement.
diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
index 68a57310ad402..ae1d7eaf7089f 100644
--- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
@@ -63,12 +63,13 @@ static void getARMHWDivFeatures(const Driver &D, const Arg *A,
 }
 
 // Handle -mfpu=.
-static void getARMFPUFeatures(const Driver &D, const Arg *A,
+unsigned getARMFPUFeatures(const Driver &D, const Arg *A,
                               const ArgList &Args, StringRef FPU,
                               std::vector &Features) {
   unsigned FPUID = llvm::ARM::parseFPU(FPU);
   if (!llvm::ARM::getFPUFeatures(FPUID, Features))
     D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args);
+  return FPUID;
 }
 
 // Decode ARM features from string like +[no]featureA+[no]featureB+...
@@ -388,18 +389,20 @@ void arm::getARMTargetFeatures(const ToolChain &TC,
     checkARMCPUName(D, CPUArg, Args, CPUName, ArchName, ExtensionFeatures,
                     Triple);
   // Honor -mfpu=. ClangAs gives preference to -Wa,-mfpu=.
+  unsigned FPUID = llvm::ARM::FK_INVALID;
   const Arg *FPUArg = Args.getLastArg(options::OPT_mfpu_EQ);
   if (WaFPU) {
     if (FPUArg)
       D.Diag(clang::diag::warn_drv_unused_argument)
           << FPUArg->getAsString(Args);
-    getARMFPUFeatures(D, WaFPU, Args, StringRef(WaFPU->getValue()).substr(6),
-                      Features);
+    (void)getARMFPUFeatures(D, WaFPU, Args, StringRef(WaFPU->getValue()).substr(6),
+                            Features);
   } else if (FPUArg) {
-    getARMFPUFeatures(D, FPUArg, Args, FPUArg->getValue(), Features);
+    FPUID = getARMFPUFeatures(D, FPUArg, Args, FPUArg->getValue(), Features);
   } else if (Triple.isAndroid() && getARMSubArchVersionNumber(Triple) >= 7) {
     const char *AndroidFPU = "neon";
-    if (!llvm::ARM::getFPUFeatures(llvm::ARM::parseFPU(AndroidFPU), Features))
+    FPUID = llvm::ARM::parseFPU(AndroidFPU);
+    if (!llvm::ARM::getFPUFeatures(FPUID, Features))
       D.Diag(clang::diag::err_drv_clang_unsupported)
           << std::string("-mfpu=") + AndroidFPU;
   }
@@ -454,21 +457,21 @@ void arm::getARMTargetFeatures(const ToolChain &TC,
   if (ABI == arm::FloatABI::Soft) {
     llvm::ARM::getFPUFeatures(llvm::ARM::FK_NONE, Features);
 
-    // Disable all features relating to hardware FP.
-    // FIXME: Disabling fpregs should be enough all by itself, since all
-    // the other FP features are dependent on it. However
-    // there is currently no easy way to test this in clang, so for
-    // now just be explicit and disable all known dependent features
-    // as well.
-    for (std::string Feature : {
-        "vfp2", "vfp2sp",
-        "vfp3", "vfp3sp", "vfp3d16", "vfp3d16sp",
-        "vfp4", "vfp4sp", "vfp4d16", "vfp4d16sp",
-        "fp-armv8", "fp-armv8sp", "fp-armv8d16", "fp-armv8d16sp",
-        "fullfp16", "neon", "crypto", "dotprod", "fp16fml",
-        "mve", "mve.fp",
-        "fp64", "d32", "fpregs"})
-      Features.push_back(Args.MakeArgString("-" + Feature));
+    // Disable all features relating to hardware FP that are not already
+    // disabled by the above call.
+    Features.insert(Features.end(), {"-neon", "-crypto", "-dotprod", "-fp16fml",
+                                     "-mve", "-mve.fp", "-fpregs"});
+  } else if (FPUID == llvm::ARM::FK_NONE) {
+    // -mfpu=none is *very* similar to -mfloat-abi=soft, except that it should
+    // not disable MVE-I.
+    Features.insert(Features.end(),
+                    {"-neon", "-crypto", "-dotprod", "-fp16fml", "-mve.fp"});
+    // Even though we remove MVE-FP, we still need to check if it was originally
+    // present among the requested extensions, because it implies MVE-I, which
+    // should not be disabled by -mfpu=none.
+    if (!llvm::is_contained(Features, "+mve") &&
+        !llvm::is_contained(Features, "+mve.fp"))
+      Features.emplace_back("-fpregs");
   }
 
   // En/disable crc code generation.
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 5881852b1424a..9a852141c6eea 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1053,11 +1053,9 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
 
   auto StatusOrErr =
       Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
-  if (!StatusOrErr) {
-    assert(false && "Invalid floating point representation");
-    return APFloat::opInvalidOp;
-  }
-  return *StatusOrErr;
+  assert(StatusOrErr && "Invalid floating point representation");
+  return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
+                                               : APFloat::opInvalidOp;
 }
 
 static inline bool IsExponentPart(char c) {
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 908fbf76d77fb..695c485649a40 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -2014,6 +2014,9 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS,
     return nullptr;
   }
 
+  if (Tok.is(tok::kw_requires))
+    ParseTrailingRequiresClause(D);
+
   // Save late-parsed attributes for now; they need to be parsed in the
   // appropriate function scope after the function Decl has been constructed.
   // These will be parsed in ParseFunctionDefinition or ParseLexedAttrList.
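
The init-declarator grammar these ParseDeclGroup changes implement is easiest to see in source form. A minimal sketch of the declarations that now parse (illustrative C++2a user code, not part of the patch; Small, f, and g are invented names):

template <typename T>
concept Small = sizeof(T) <= sizeof(int);

template <typename T>
void f(T) requires Small<T>;              // declarator followed by requires-clause

template <typename T>
T g(T x) requires Small<T> { return x; }  // likewise on a function definition
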
@@ -2165,6 +2168,12 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS,
     ParseDeclarator(D);
     if (!D.isInvalidType()) {
+      // C++2a [dcl.decl]p1
+      //   init-declarator:
+      //     declarator initializer[opt]
+      //     declarator requires-clause
+      if (Tok.is(tok::kw_requires))
+        ParseTrailingRequiresClause(D);
       Decl *ThisDecl = ParseDeclarationAfterDeclarator(D);
       D.complete(ThisDecl);
       if (ThisDecl)
@@ -6033,6 +6042,22 @@ void Parser::ParseDirectDeclarator(Declarator &D) {
       PrototypeScope.Exit();
     } else if (Tok.is(tok::l_square)) {
       ParseBracketDeclarator(D);
+    } else if (Tok.is(tok::kw_requires) && D.hasGroupingParens()) {
+      // This declarator is declaring a function, but the requires clause is
+      // in the wrong place:
+      //   void (f() requires true);
+      // instead of
+      //   void f() requires true;
+      // or
+      //   void (f()) requires true;
+      Diag(Tok, diag::err_requires_clause_inside_parens);
+      ConsumeToken();
+      ExprResult TrailingRequiresClause = Actions.CorrectDelayedTyposInExpr(
+          ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true));
+      if (TrailingRequiresClause.isUsable() && D.isFunctionDeclarator() &&
+          !D.hasTrailingRequiresClause())
+        // We're already ill-formed if we got here but we'll accept it anyway.
+        D.setTrailingRequiresClause(TrailingRequiresClause.get());
     } else {
       break;
     }
@@ -6213,6 +6238,47 @@ void Parser::ParseParenDeclarator(Declarator &D) {
   PrototypeScope.Exit();
 }
 
+void Parser::InitCXXThisScopeForDeclaratorIfRelevant(
+    const Declarator &D, const DeclSpec &DS,
+    llvm::Optional &ThisScope) {
+  // C++11 [expr.prim.general]p3:
+  //   If a declaration declares a member function or member function
+  //   template of a class X, the expression this is a prvalue of type
+  //   "pointer to cv-qualifier-seq X" between the optional cv-qualifier-seq
+  //   and the end of the function-definition, member-declarator, or
+  //   declarator.
+  // FIXME: currently, "static" case isn't handled correctly.
+  bool IsCXX11MemberFunction = getLangOpts().CPlusPlus11 &&
+      D.getDeclSpec().getStorageClassSpec() != DeclSpec::SCS_typedef &&
+      (D.getContext() == DeclaratorContext::MemberContext
+           ? !D.getDeclSpec().isFriendSpecified()
+           : D.getContext() == DeclaratorContext::FileContext &&
+                 D.getCXXScopeSpec().isValid() &&
+                 Actions.CurContext->isRecord());
+  if (!IsCXX11MemberFunction)
+    return;
+
+  Qualifiers Q = Qualifiers::fromCVRUMask(DS.getTypeQualifiers());
+  if (D.getDeclSpec().hasConstexprSpecifier() && !getLangOpts().CPlusPlus14)
+    Q.addConst();
+  // FIXME: Collect C++ address spaces.
+  // If there are multiple different address spaces, the source is invalid.
+  // Carry on using the first addr space for the qualifiers of 'this'.
+  // The diagnostic will be given later while creating the function
+  // prototype for the method.
+  if (getLangOpts().OpenCLCPlusPlus) {
+    for (ParsedAttr &attr : DS.getAttributes()) {
+      LangAS ASIdx = attr.asOpenCLLangAS();
+      if (ASIdx != LangAS::Default) {
+        Q.addAddressSpace(ASIdx);
+        break;
+      }
+    }
+  }
+  ThisScope.emplace(Actions, dyn_cast(Actions.CurContext), Q,
+                    IsCXX11MemberFunction);
+}
+
 /// ParseFunctionDeclarator - We are after the identifier and have parsed the
 /// declarator D up to a paren, which indicates that we are parsing function
 /// arguments.
@@ -6226,7 +6292,8 @@ void Parser::ParseParenDeclarator(Declarator &D) {
 ///
 /// For C++, after the parameter-list, it also parses the cv-qualifier-seq[opt],
 /// (C++11) ref-qualifier[opt], exception-specification[opt],
-/// (C++11) attribute-specifier-seq[opt], and (C++11) trailing-return-type[opt].
+/// (C++11) attribute-specifier-seq[opt], (C++11) trailing-return-type[opt] and +/// (C++2a) the trailing requires-clause. /// /// [C++11] exception-specification: /// dynamic-exception-specification @@ -6321,43 +6388,8 @@ void Parser::ParseFunctionDeclarator(Declarator &D, if (ParseRefQualifier(RefQualifierIsLValueRef, RefQualifierLoc)) EndLoc = RefQualifierLoc; - // C++11 [expr.prim.general]p3: - // If a declaration declares a member function or member function - // template of a class X, the expression this is a prvalue of type - // "pointer to cv-qualifier-seq X" between the optional cv-qualifer-seq - // and the end of the function-definition, member-declarator, or - // declarator. - // FIXME: currently, "static" case isn't handled correctly. - bool IsCXX11MemberFunction = - getLangOpts().CPlusPlus11 && - D.getDeclSpec().getStorageClassSpec() != DeclSpec::SCS_typedef && - (D.getContext() == DeclaratorContext::MemberContext - ? !D.getDeclSpec().isFriendSpecified() - : D.getContext() == DeclaratorContext::FileContext && - D.getCXXScopeSpec().isValid() && - Actions.CurContext->isRecord()); - - Qualifiers Q = Qualifiers::fromCVRUMask(DS.getTypeQualifiers()); - if (D.getDeclSpec().hasConstexprSpecifier() && !getLangOpts().CPlusPlus14) - Q.addConst(); - // FIXME: Collect C++ address spaces. - // If there are multiple different address spaces, the source is invalid. - // Carry on using the first addr space for the qualifiers of 'this'. - // The diagnostic will be given later while creating the function - // prototype for the method. - if (getLangOpts().OpenCLCPlusPlus) { - for (ParsedAttr &attr : DS.getAttributes()) { - LangAS ASIdx = attr.asOpenCLLangAS(); - if (ASIdx != LangAS::Default) { - Q.addAddressSpace(ASIdx); - break; - } - } - } - - Sema::CXXThisScopeRAII ThisScope( - Actions, dyn_cast(Actions.CurContext), Q, - IsCXX11MemberFunction); + llvm::Optional ThisScope; + InitCXXThisScopeForDeclaratorIfRelevant(D, DS, ThisScope); // Parse exception-specification[opt]. bool Delayed = D.isFirstDeclarationOfMember() && @@ -6625,6 +6657,17 @@ void Parser::ParseParameterDeclarationClause( // Parse GNU attributes, if present. MaybeParseGNUAttributes(ParmDeclarator); + if (Tok.is(tok::kw_requires)) { + // User tried to define a requires clause in a parameter declaration, + // which is surely not a function declaration. + // void f(int (*g)(int, int) requires true); + Diag(Tok, + diag::err_requires_clause_on_declarator_not_declaring_a_function); + ConsumeToken(); + Actions.CorrectDelayedTyposInExpr( + ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true)); + } + // Remember this parsed parameter in ParamInfo. 
IdentifierInfo *ParmII = ParmDeclarator.getIdentifier(); diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index af3403403c11b..081d4d8b12092 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -2301,6 +2301,7 @@ bool Parser::ParseCXXMemberDeclaratorBeforeInitializer( LateParsedAttrList &LateParsedAttrs) { // member-declarator: // declarator pure-specifier[opt] + // declarator requires-clause // declarator brace-or-equal-initializer[opt] // identifier[opt] ':' constant-expression if (Tok.isNot(tok::colon)) @@ -2314,6 +2315,8 @@ bool Parser::ParseCXXMemberDeclaratorBeforeInitializer( BitfieldSize = ParseConstantExpression(); if (BitfieldSize.isInvalid()) SkipUntil(tok::comma, StopAtSemi | StopBeforeMatch); + } else if (Tok.is(tok::kw_requires)) { + ParseTrailingRequiresClause(DeclaratorInfo); } else { ParseOptionalCXX11VirtSpecifierSeq( VS, getCurrentClass().IsInterface, @@ -2436,6 +2439,7 @@ void Parser::MaybeParseAndDiagnoseDeclSpecAfterCXX11VirtSpecifierSeq( /// /// member-declarator: /// declarator virt-specifier-seq[opt] pure-specifier[opt] +/// [C++2a] declarator requires-clause /// declarator constant-initializer[opt] /// [C++11] declarator brace-or-equal-initializer[opt] /// identifier[opt] ':' constant-expression @@ -2669,6 +2673,7 @@ Parser::ParseCXXClassMemberDeclaration(AccessSpecifier AS, SmallVector DeclsInGroup; ExprResult BitfieldSize; + ExprResult TrailingRequiresClause; bool ExpectSemi = true; // Parse the first declarator. @@ -3793,6 +3798,62 @@ TypeResult Parser::ParseTrailingReturnType(SourceRange &Range, : DeclaratorContext::TrailingReturnContext); } +/// Parse a requires-clause as part of a function declaration. +void Parser::ParseTrailingRequiresClause(Declarator &D) { + assert(Tok.is(tok::kw_requires) && "expected requires"); + + SourceLocation RequiresKWLoc = ConsumeToken(); + + ExprResult TrailingRequiresClause; + ParseScope ParamScope(this, + Scope::DeclScope | + Scope::FunctionDeclarationScope | + Scope::FunctionPrototypeScope); + + Actions.ActOnStartTrailingRequiresClause(getCurScope(), D); + + llvm::Optional ThisScope; + InitCXXThisScopeForDeclaratorIfRelevant(D, D.getDeclSpec(), ThisScope); + + TrailingRequiresClause = + ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true); + + TrailingRequiresClause = + Actions.ActOnFinishTrailingRequiresClause(TrailingRequiresClause); + + if (!D.isDeclarationOfFunction()) { + Diag(RequiresKWLoc, + diag::err_requires_clause_on_declarator_not_declaring_a_function); + return; + } + + if (TrailingRequiresClause.isInvalid()) + SkipUntil({tok::l_brace, tok::arrow, tok::kw_try, tok::comma, tok::colon}, + StopAtSemi | StopBeforeMatch); + else + D.setTrailingRequiresClause(TrailingRequiresClause.get()); + + // Did the user swap the trailing return type and requires clause? 
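+  // (Illustrative: `template <class T> auto f(T) requires true -> int;` is
+  // recovered below as if the conforming order
+  // `template <class T> auto f(T) -> int requires true;` had been written.)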
+ if (D.isFunctionDeclarator() && Tok.is(tok::arrow) && + D.getDeclSpec().getTypeSpecType() == TST_auto) { + SourceLocation ArrowLoc = Tok.getLocation(); + SourceRange Range; + TypeResult TrailingReturnType = + ParseTrailingReturnType(Range, /*MayBeFollowedByDirectInit=*/false); + + if (!TrailingReturnType.isInvalid()) { + Diag(ArrowLoc, + diag::err_requires_clause_must_appear_after_trailing_return) + << Range; + auto &FunctionChunk = D.getFunctionTypeInfo(); + FunctionChunk.HasTrailingReturnType = TrailingReturnType.isUsable(); + FunctionChunk.TrailingReturnType = TrailingReturnType.get(); + } else + SkipUntil({tok::equal, tok::l_brace, tok::arrow, tok::kw_try, tok::comma}, + StopAtSemi | StopBeforeMatch); + } +} + /// We have just started parsing the definition of a new class, /// so push that class onto our stack of classes that is currently /// being parsed. diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 6ef303047db49..d5fc3864599df 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -22,6 +22,7 @@ #include "clang/Parse/Parser.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ExprCXX.h" #include "clang/Basic/PrettyStackTrace.h" #include "clang/Parse/RAIIObjectsForParser.h" #include "clang/Sema/DeclSpec.h" @@ -145,7 +146,7 @@ Parser::ParseExpressionWithLeadingExtension(SourceLocation ExtLoc) { // Silence extension warnings in the sub-expression ExtensionRAIIObject O(Diags); - LHS = ParseCastExpression(false); + LHS = ParseCastExpression(AnyCastExpr); } if (!LHS.isInvalid()) @@ -169,7 +170,7 @@ ExprResult Parser::ParseAssignmentExpression(TypeCastState isTypeCast) { if (Tok.is(tok::kw_co_yield)) return ParseCoyieldExpression(); - ExprResult LHS = ParseCastExpression(/*isUnaryExpression=*/false, + ExprResult LHS = ParseCastExpression(AnyCastExpr, /*isAddressOfOperand=*/false, isTypeCast); return ParseRHSOfBinaryExpression(LHS, prec::Assignment); @@ -202,7 +203,7 @@ Parser::ParseConstantExpressionInExprEvalContext(TypeCastState isTypeCast) { Sema::ExpressionEvaluationContext::ConstantEvaluated && "Call this function only if your ExpressionEvaluationContext is " "already ConstantEvaluated"); - ExprResult LHS(ParseCastExpression(false, false, isTypeCast)); + ExprResult LHS(ParseCastExpression(AnyCastExpr, false, isTypeCast)); ExprResult Res(ParseRHSOfBinaryExpression(LHS, prec::Conditional)); return Actions.ActOnConstantExpression(Res); } @@ -220,7 +221,7 @@ ExprResult Parser::ParseConstantExpression(TypeCastState isTypeCast) { ExprResult Parser::ParseCaseExpression(SourceLocation CaseLoc) { EnterExpressionEvaluationContext ConstantEvaluated( Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated); - ExprResult LHS(ParseCastExpression(false, false, NotTypeCast)); + ExprResult LHS(ParseCastExpression(AnyCastExpr, false, NotTypeCast)); ExprResult Res(ParseRHSOfBinaryExpression(LHS, prec::Conditional)); return Actions.ActOnCaseExpr(CaseLoc, Res); } @@ -234,13 +235,143 @@ ExprResult Parser::ParseCaseExpression(SourceLocation CaseLoc) { ExprResult Parser::ParseConstraintExpression() { EnterExpressionEvaluationContext ConstantEvaluated( Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated); - ExprResult LHS(ParseCastExpression(/*isUnaryExpression=*/false)); + ExprResult LHS(ParseCastExpression(AnyCastExpr)); ExprResult Res(ParseRHSOfBinaryExpression(LHS, prec::LogicalOr)); - if (Res.isUsable() && !Actions.CheckConstraintExpression(Res.get())) + if (Res.isUsable() && 
!Actions.CheckConstraintExpression(Res.get())) {
+    Actions.CorrectDelayedTyposInExpr(Res);
     return ExprError();
+  }
   return Res;
 }
 
+/// \brief Parse a constraint-logical-and-expression.
+///
+/// \verbatim
+/// C++2a[temp.constr.decl]p1
+///     constraint-logical-and-expression:
+///         primary-expression
+///         constraint-logical-and-expression '&&' primary-expression
+///
+/// \endverbatim
+ExprResult
+Parser::ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause) {
+  EnterExpressionEvaluationContext ConstantEvaluated(
+      Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+  bool NotPrimaryExpression = false;
+  auto ParsePrimary = [&] () {
+    ExprResult E = ParseCastExpression(PrimaryExprOnly,
+                                       /*isAddressOfOperand=*/false,
+                                       /*isTypeCast=*/NotTypeCast,
+                                       /*isVectorLiteral=*/false,
+                                       &NotPrimaryExpression);
+    if (E.isInvalid())
+      return ExprError();
+    auto RecoverFromNonPrimary = [&] (ExprResult E, bool Note) {
+        E = ParsePostfixExpressionSuffix(E);
+        // Use InclusiveOr, the precedence just above '&&', so that we do not
+        // also consume the following operands of the logical and.
+        E = ParseRHSOfBinaryExpression(E, prec::InclusiveOr);
+        if (!E.isInvalid())
+          Diag(E.get()->getExprLoc(),
+               Note
+               ? diag::note_unparenthesized_non_primary_expr_in_requires_clause
+               : diag::err_unparenthesized_non_primary_expr_in_requires_clause)
+               << FixItHint::CreateInsertion(E.get()->getBeginLoc(), "(")
+               << FixItHint::CreateInsertion(
+                   PP.getLocForEndOfToken(E.get()->getEndLoc()), ")")
+               << E.get()->getSourceRange();
+        return E;
+    };
+
+    if (NotPrimaryExpression ||
+        // Check if the following tokens must be a part of a non-primary
+        // expression
+        getBinOpPrecedence(Tok.getKind(), GreaterThanIsOperator,
+                           /*CPlusPlus11=*/true) > prec::LogicalAnd ||
+        // Postfix operators other than '(' (which will be checked for in
+        // CheckConstraintExpression).
+        Tok.isOneOf(tok::period, tok::plusplus, tok::minusminus) ||
+        (Tok.is(tok::l_square) && !NextToken().is(tok::l_square))) {
+      E = RecoverFromNonPrimary(E, /*Note=*/false);
+      if (E.isInvalid())
+        return ExprError();
+      NotPrimaryExpression = false;
+    }
+    bool PossibleNonPrimary;
+    bool IsConstraintExpr =
+        Actions.CheckConstraintExpression(E.get(), Tok, &PossibleNonPrimary,
+                                          IsTrailingRequiresClause);
+    if (!IsConstraintExpr || PossibleNonPrimary) {
+      // Atomic constraint might be an unparenthesized non-primary expression
+      // (such as a binary operator), in which case we might get here (e.g. in
+      // 'requires 0 + 1 && true' we would now be at '+', and parse and ignore
+      // the rest of the addition expression). Try to parse the rest of it here.
+      if (PossibleNonPrimary)
+        E = RecoverFromNonPrimary(E, /*Note=*/!IsConstraintExpr);
+      Actions.CorrectDelayedTyposInExpr(E);
+      return ExprError();
+    }
+    return E;
+  };
+  ExprResult LHS = ParsePrimary();
+  if (LHS.isInvalid())
+    return ExprError();
+  while (Tok.is(tok::ampamp)) {
+    SourceLocation LogicalAndLoc = ConsumeToken();
+    ExprResult RHS = ParsePrimary();
+    if (RHS.isInvalid()) {
+      Actions.CorrectDelayedTyposInExpr(LHS);
+      return ExprError();
+    }
+    ExprResult Op = Actions.ActOnBinOp(getCurScope(), LogicalAndLoc,
+                                       tok::ampamp, LHS.get(), RHS.get());
+    if (!Op.isUsable()) {
+      Actions.CorrectDelayedTyposInExpr(RHS);
+      Actions.CorrectDelayedTyposInExpr(LHS);
+      return ExprError();
+    }
+    LHS = Op;
+  }
+  return LHS;
+}
+
+/// \brief Parse a constraint-logical-or-expression.
+/// +/// \verbatim +/// C++2a[temp.constr.decl]p1 +/// constraint-logical-or-expression: +/// constraint-logical-and-expression +/// constraint-logical-or-expression '||' +/// constraint-logical-and-expression +/// +/// \endverbatim +ExprResult +Parser::ParseConstraintLogicalOrExpression(bool IsTrailingRequiresClause) { + ExprResult LHS(ParseConstraintLogicalAndExpression(IsTrailingRequiresClause)); + if (!LHS.isUsable()) + return ExprError(); + while (Tok.is(tok::pipepipe)) { + SourceLocation LogicalOrLoc = ConsumeToken(); + ExprResult RHS = + ParseConstraintLogicalAndExpression(IsTrailingRequiresClause); + if (!RHS.isUsable()) { + Actions.CorrectDelayedTyposInExpr(LHS); + return ExprError(); + } + ExprResult Op = Actions.ActOnBinOp(getCurScope(), LogicalOrLoc, + tok::pipepipe, LHS.get(), RHS.get()); + if (!Op.isUsable()) { + Actions.CorrectDelayedTyposInExpr(RHS); + Actions.CorrectDelayedTyposInExpr(LHS); + return ExprError(); + } + LHS = Op; + } + return LHS; +} + bool Parser::isNotExpressionStart() { tok::TokenKind K = Tok.getKind(); if (K == tok::l_brace || K == tok::r_brace || @@ -414,7 +545,7 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) { } else if (getLangOpts().CPlusPlus && NextTokPrec <= prec::Conditional) RHS = ParseAssignmentExpression(); else - RHS = ParseCastExpression(false); + RHS = ParseCastExpression(AnyCastExpr); if (RHS.isInvalid()) { // FIXME: Errors generated by the delayed typo correction should be @@ -519,22 +650,24 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) { } } -/// Parse a cast-expression, or, if \p isUnaryExpression is true, -/// parse a unary-expression. +/// Parse a cast-expression, unary-expression or primary-expression, based +/// on \p ExprType. /// /// \p isAddressOfOperand exists because an id-expression that is the /// operand of address-of gets special treatment due to member pointers. /// -ExprResult Parser::ParseCastExpression(bool isUnaryExpression, +ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand, TypeCastState isTypeCast, - bool isVectorLiteral) { + bool isVectorLiteral, + bool *NotPrimaryExpression) { bool NotCastExpr; - ExprResult Res = ParseCastExpression(isUnaryExpression, + ExprResult Res = ParseCastExpression(ParseKind, isAddressOfOperand, NotCastExpr, isTypeCast, - isVectorLiteral); + isVectorLiteral, + NotPrimaryExpression); if (NotCastExpr) Diag(Tok, diag::err_expected_expression); return Res; @@ -759,11 +892,12 @@ class CastExpressionIdValidator final : public CorrectionCandidateCallback { /// '__is_rvalue_expr' /// \endverbatim /// -ExprResult Parser::ParseCastExpression(bool isUnaryExpression, +ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand, bool &NotCastExpr, TypeCastState isTypeCast, - bool isVectorLiteral) { + bool isVectorLiteral, + bool *NotPrimaryExpression) { ExprResult Res; tok::TokenKind SavedKind = Tok.getKind(); auto SavedType = PreferredType; @@ -782,11 +916,21 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, // ParsePostfixExpressionSuffix. switch (SavedKind) { case tok::l_paren: { - // If this expression is limited to being a unary-expression, the parent can + // If this expression is limited to being a unary-expression, the paren can // not start a cast expression. - ParenParseOption ParenExprType = - (isUnaryExpression && !getLangOpts().CPlusPlus) ? 
CompoundLiteral - : CastExpr; + ParenParseOption ParenExprType; + switch (ParseKind) { + case CastParseKind::UnaryExprOnly: + if (!getLangOpts().CPlusPlus) + ParenExprType = CompoundLiteral; + LLVM_FALLTHROUGH; + case CastParseKind::AnyCastExpr: + ParenExprType = ParenParseOption::CastExpr; + break; + case CastParseKind::PrimaryExprOnly: + ParenExprType = FoldExpr; + break; + } ParsedType CastTy; SourceLocation RParenLoc; Res = ParseParenExpression(ParenExprType, false/*stopIfCastExr*/, @@ -861,8 +1005,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, if (TryAnnotateTypeOrScopeToken()) return ExprError(); assert(Tok.isNot(tok::kw_decltype) && Tok.isNot(tok::kw___super)); - return ParseCastExpression(isUnaryExpression, isAddressOfOperand); - + return ParseCastExpression(ParseKind, isAddressOfOperand, isTypeCast, + isVectorLiteral, NotPrimaryExpression); + case tok::identifier: { // primary-expression: identifier // unqualified-id: identifier // constant: enumeration-constant @@ -949,8 +1094,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, = RevertibleTypeTraits.find(II); if (Known != RevertibleTypeTraits.end()) { Tok.setKind(Known->second); - return ParseCastExpression(isUnaryExpression, isAddressOfOperand, - NotCastExpr, isTypeCast); + return ParseCastExpression(ParseKind, isAddressOfOperand, + NotCastExpr, isTypeCast, + isVectorLiteral, NotPrimaryExpression); } } @@ -961,7 +1107,10 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, if (TryAnnotateTypeOrScopeToken()) return ExprError(); if (!Tok.is(tok::identifier)) - return ParseCastExpression(isUnaryExpression, isAddressOfOperand); + return ParseCastExpression(ParseKind, isAddressOfOperand, + NotCastExpr, isTypeCast, + isVectorLiteral, + NotPrimaryExpression); } } @@ -1076,8 +1225,10 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, Tok.is(tok::r_paren) ? nullptr : &Replacement); if (!Res.isInvalid() && Res.isUnset()) { UnconsumeToken(Replacement); - return ParseCastExpression(isUnaryExpression, isAddressOfOperand, - NotCastExpr, isTypeCast); + return ParseCastExpression(ParseKind, isAddressOfOperand, + NotCastExpr, isTypeCast, + /*isVectorLiteral=*/false, + NotPrimaryExpression); } if (!Res.isInvalid() && Tok.is(tok::less)) checkPotentialAngleBracket(Res); @@ -1122,12 +1273,16 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::kw___builtin_FILE: case tok::kw___builtin_FUNCTION: case tok::kw___builtin_LINE: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseBuiltinPrimaryExpression(); case tok::kw___null: return Actions.ActOnGNUNullExpr(ConsumeToken()); case tok::plusplus: // unary-expression: '++' unary-expression [C99] case tok::minusminus: { // unary-expression: '--' unary-expression [C99] + if (NotPrimaryExpression) + *NotPrimaryExpression = true; // C++ [expr.unary] has: // unary-expression: // ++ cast-expression @@ -1140,7 +1295,8 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, // One special case is implicitly handled here: if the preceding tokens are // an ambiguous cast expression, such as "(T())++", then we recurse to // determine whether the '++' is prefix or postfix. - Res = ParseCastExpression(!getLangOpts().CPlusPlus, + Res = ParseCastExpression(getLangOpts().CPlusPlus ? 
+ UnaryExprOnly : AnyCastExpr, /*isAddressOfOperand*/false, NotCastExpr, NotTypeCast); if (NotCastExpr) { @@ -1156,10 +1312,12 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, return Res; } case tok::amp: { // unary-expression: '&' cast-expression + if (NotPrimaryExpression) + *NotPrimaryExpression = true; // Special treatment because of member pointers SourceLocation SavedLoc = ConsumeToken(); PreferredType.enterUnary(Actions, Tok.getLocation(), tok::amp, SavedLoc); - Res = ParseCastExpression(false, true); + Res = ParseCastExpression(AnyCastExpr, true); if (!Res.isInvalid()) Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Res.get()); return Res; @@ -1172,17 +1330,21 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::exclaim: // unary-expression: '!' cast-expression case tok::kw___real: // unary-expression: '__real' cast-expression [GNU] case tok::kw___imag: { // unary-expression: '__imag' cast-expression [GNU] + if (NotPrimaryExpression) + *NotPrimaryExpression = true; SourceLocation SavedLoc = ConsumeToken(); PreferredType.enterUnary(Actions, Tok.getLocation(), SavedKind, SavedLoc); - Res = ParseCastExpression(false); + Res = ParseCastExpression(AnyCastExpr); if (!Res.isInvalid()) Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Res.get()); return Res; } case tok::kw_co_await: { // unary-expression: 'co_await' cast-expression + if (NotPrimaryExpression) + *NotPrimaryExpression = true; SourceLocation CoawaitLoc = ConsumeToken(); - Res = ParseCastExpression(false); + Res = ParseCastExpression(AnyCastExpr); if (!Res.isInvalid()) Res = Actions.ActOnCoawaitExpr(getCurScope(), CoawaitLoc, Res.get()); return Res; @@ -1190,9 +1352,11 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::kw___extension__:{//unary-expression:'__extension__' cast-expr [GNU] // __extension__ silences extension warnings in the subexpression. + if (NotPrimaryExpression) + *NotPrimaryExpression = true; ExtensionRAIIObject O(Diags); // Use RAII to do this. 
SourceLocation SavedLoc = ConsumeToken(); - Res = ParseCastExpression(false); + Res = ParseCastExpression(AnyCastExpr); if (!Res.isInvalid()) Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Res.get()); return Res; @@ -1209,8 +1373,12 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::kw_vec_step: // unary-expression: OpenCL 'vec_step' expression // unary-expression: '__builtin_omp_required_simd_align' '(' type-name ')' case tok::kw___builtin_omp_required_simd_align: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseUnaryExprOrTypeTraitExpression(); case tok::ampamp: { // unary-expression: '&&' identifier + if (NotPrimaryExpression) + *NotPrimaryExpression = true; SourceLocation AmpAmpLoc = ConsumeToken(); if (Tok.isNot(tok::identifier)) return ExprError(Diag(Tok, diag::err_expected) << tok::identifier); @@ -1229,18 +1397,26 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::kw_dynamic_cast: case tok::kw_reinterpret_cast: case tok::kw_static_cast: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Res = ParseCXXCasts(); break; case tok::kw___builtin_bit_cast: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Res = ParseBuiltinBitCast(); break; case tok::kw_typeid: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Res = ParseCXXTypeid(); break; case tok::kw___unique_stable_name: Res = ParseUniqueStableNameExpression(); break; case tok::kw___uuidof: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Res = ParseCXXUuidof(); break; case tok::kw_this: @@ -1305,6 +1481,10 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, return ExprError(); } + // Everything henceforth is a postfix-expression. + if (NotPrimaryExpression) + *NotPrimaryExpression = true; + if (SavedKind == tok::kw_typename) { // postfix-expression: typename-specifier '(' expression-list[opt] ')' // typename-specifier braced-init-list @@ -1341,8 +1521,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, if (TryAnnotateTypeOrScopeToken()) return ExprError(); if (!Tok.is(tok::annot_cxxscope)) - return ParseCastExpression(isUnaryExpression, isAddressOfOperand, - NotCastExpr, isTypeCast); + return ParseCastExpression(ParseKind, isAddressOfOperand, NotCastExpr, + isTypeCast, isVectorLiteral, + NotPrimaryExpression); Token Next = NextToken(); if (Next.is(tok::annot_template_id)) { @@ -1355,8 +1536,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false); AnnotateTemplateIdTokenAsType(); - return ParseCastExpression(isUnaryExpression, isAddressOfOperand, - NotCastExpr, isTypeCast); + return ParseCastExpression(ParseKind, isAddressOfOperand, NotCastExpr, + isTypeCast, isVectorLiteral, + NotPrimaryExpression); } } @@ -1372,8 +1554,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, // translate it into a type and continue parsing as a cast // expression. AnnotateTemplateIdTokenAsType(); - return ParseCastExpression(isUnaryExpression, isAddressOfOperand, - NotCastExpr, isTypeCast); + return ParseCastExpression(ParseKind, isAddressOfOperand, + NotCastExpr, isTypeCast, isVectorLiteral, + NotPrimaryExpression); } // Fall through to treat the template-id as an id-expression. 
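
The PrimaryExprOnly plumbing threaded through the cases above exists because each atomic constraint in a requires-clause must be a primary-expression. A short sketch of the user-visible effect (illustrative names; not code from the patch):

template <typename T>
concept Four = sizeof(T) == 4;        // '==' is fine inside a concept definition

template <typename T>
void f() requires Four<T>;            // a concept-id is a primary-expression

// template <typename T>
// void g() requires sizeof(T) == 4;  // rejected: not a primary-expression

template <typename T>
void g() requires (sizeof(T) == 4);   // the parenthesized form is accepted
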
@@ -1390,15 +1573,22 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, if (TryAnnotateTypeOrScopeToken()) return ExprError(); if (!Tok.is(tok::coloncolon)) - return ParseCastExpression(isUnaryExpression, isAddressOfOperand); + return ParseCastExpression(ParseKind, isAddressOfOperand, isTypeCast, + isVectorLiteral, NotPrimaryExpression); // ::new -> [C++] new-expression // ::delete -> [C++] delete-expression SourceLocation CCLoc = ConsumeToken(); - if (Tok.is(tok::kw_new)) + if (Tok.is(tok::kw_new)) { + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseCXXNewExpression(true, CCLoc); - if (Tok.is(tok::kw_delete)) + } + if (Tok.is(tok::kw_delete)) { + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseCXXDeleteExpression(true, CCLoc); + } // This is not a type name or scope specifier, it is an invalid expression. Diag(CCLoc, diag::err_expected_expression); @@ -1406,12 +1596,18 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, } case tok::kw_new: // [C++] new-expression + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseCXXNewExpression(false, Tok.getLocation()); case tok::kw_delete: // [C++] delete-expression + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseCXXDeleteExpression(false, Tok.getLocation()); case tok::kw_noexcept: { // [C++0x] 'noexcept' '(' expression ')' + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Diag(Tok, diag::warn_cxx98_compat_noexcept_expr); SourceLocation KeyLoc = ConsumeToken(); BalancedDelimiterTracker T(*this, tok::l_paren); @@ -1440,13 +1636,19 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::kw___array_rank: case tok::kw___array_extent: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseArrayTypeTrait(); case tok::kw___is_lvalue_expr: case tok::kw___is_rvalue_expr: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseExpressionTrait(); case tok::at: { + if (NotPrimaryExpression) + *NotPrimaryExpression = true; SourceLocation AtLoc = ConsumeToken(); return ParseObjCAtExpression(AtLoc); } @@ -1468,8 +1670,13 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, // expression, or we have something that doesn't appear to be a lambda. // If we're in the last case, we fall back to ParseObjCMessageExpression. Res = TryParseLambdaExpression(); - if (!Res.isInvalid() && !Res.get()) + if (!Res.isInvalid() && !Res.get()) { + // We assume Objective-C++ message expressions are not + // primary-expressions. + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Res = ParseObjCMessageExpression(); + } break; } Res = ParseLambdaExpression(); @@ -1489,6 +1696,11 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, // are compiling for OpenCL, we need to return an error as this implies // that the address of the function is being taken, which is illegal in CL. + if (ParseKind == PrimaryExprOnly) + // This is strictly a primary-expression - no postfix-expr pieces should be + // parsed. + return Res; + // These can be followed by postfix-expr pieces. 
PreferredType = SavedType; Res = ParsePostfixExpressionSuffix(Res); @@ -1932,7 +2144,7 @@ Parser::ParseExprAfterUnaryExprOrTypeTrait(const Token &OpTok, return ExprError(); } - Operand = ParseCastExpression(true/*isUnaryExpression*/); + Operand = ParseCastExpression(UnaryExprOnly); } else { // If it starts with a '(', we know that it is either a parenthesized // type-name, or it is a unary-expression that starts with a compound @@ -2519,8 +2731,8 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, RParenLoc = T.getCloseLocation(); PreferredType.enterTypeCast(Tok.getLocation(), Ty.get().get()); - ExprResult SubExpr = ParseCastExpression(/*isUnaryExpression=*/false); - + ExprResult SubExpr = ParseCastExpression(AnyCastExpr); + if (Ty.isInvalid() || SubExpr.isInvalid()) return ExprError(); @@ -2600,7 +2812,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, // Parse the cast-expression that follows it next. // isVectorLiteral = true will make sure we don't parse any // Postfix expression yet - Result = ParseCastExpression(/*isUnaryExpression=*/false, + Result = ParseCastExpression(/*isUnaryExpression=*/AnyCastExpr, /*isAddressOfOperand=*/false, /*isTypeCast=*/IsTypeCast, /*isVectorLiteral=*/true); @@ -2652,7 +2864,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, PreferredType.enterTypeCast(Tok.getLocation(), CastTy.get()); // Parse the cast-expression that follows it next. // TODO: For cast expression with CastTy. - Result = ParseCastExpression(/*isUnaryExpression=*/false, + Result = ParseCastExpression(/*isUnaryExpression=*/AnyCastExpr, /*isAddressOfOperand=*/false, /*isTypeCast=*/IsTypeCast); if (!Result.isInvalid()) { diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index a39998482e956..f4ffa08b2a1b7 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -1371,10 +1371,6 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( DeclEndLoc = Range.getEnd(); } - PrototypeScope.Exit(); - - WarnIfHasCUDATargetAttr(); - SourceLocation NoLoc; D.AddTypeInfo(DeclaratorChunk::getFunction( /*HasProto=*/true, @@ -1389,13 +1385,22 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( /*DeclsInPrototype=*/None, LParenLoc, FunLocalRangeEnd, D, TrailingReturnType, &DS), std::move(Attr), DeclEndLoc); + + // Parse requires-clause[opt]. + if (Tok.is(tok::kw_requires)) + ParseTrailingRequiresClause(D); + + PrototypeScope.Exit(); + + WarnIfHasCUDATargetAttr(); } else if (Tok.isOneOf(tok::kw_mutable, tok::arrow, tok::kw___attribute, tok::kw_constexpr, tok::kw_consteval, tok::kw___private, tok::kw___global, tok::kw___local, - tok::kw___constant, tok::kw___generic) || + tok::kw___constant, tok::kw___generic, + tok::kw_requires) || (Tok.is(tok::l_square) && NextToken().is(tok::l_square))) { // It's common to forget that one needs '()' before 'mutable', an attribute - // specifier, or the result type. Deal with this. + // specifier, the result type, or the requires clause. Deal with this. 
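+    // (Illustrative: writing `[] requires true { }` instead of
+    // `[]() requires true { }` lands here; the parser diagnoses the missing
+    // `()` and then continues as if it had been present.)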
unsigned TokKind = 0; switch (Tok.getKind()) { case tok::kw_mutable: TokKind = 0; break; @@ -1409,6 +1414,7 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( case tok::l_square: TokKind = 2; break; case tok::kw_constexpr: TokKind = 3; break; case tok::kw_consteval: TokKind = 4; break; + case tok::kw_requires: TokKind = 5; break; default: llvm_unreachable("Unknown token kind"); } @@ -1440,8 +1446,6 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( DeclEndLoc = Range.getEnd(); } - WarnIfHasCUDATargetAttr(); - SourceLocation NoLoc; D.AddTypeInfo(DeclaratorChunk::getFunction( /*HasProto=*/true, @@ -1462,6 +1466,12 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( /*DeclsInPrototype=*/None, DeclLoc, DeclEndLoc, D, TrailingReturnType), std::move(Attr), DeclEndLoc); + + // Parse the requires-clause, if present. + if (Tok.is(tok::kw_requires)) + ParseTrailingRequiresClause(D); + + WarnIfHasCUDATargetAttr(); } // FIXME: Rename BlockScope -> ClosureScope if we decide to continue using @@ -3238,7 +3248,7 @@ Parser::ParseCXXDeleteExpression(bool UseGlobal, SourceLocation Start) { return ExprError(); } - ExprResult Operand(ParseCastExpression(false)); + ExprResult Operand(ParseCastExpression(AnyCastExpr)); if (Operand.isInvalid()) return Operand; @@ -3469,7 +3479,7 @@ Parser::ParseCXXAmbiguousParenExpression(ParenParseOption &ExprType, // If it is not a cast-expression, NotCastExpr will be true and no token // will be consumed. ColonProt.restore(); - Result = ParseCastExpression(false/*isUnaryExpression*/, + Result = ParseCastExpression(AnyCastExpr, false/*isAddressofOperand*/, NotCastExpr, // type-id has priority. diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 24855df334f4c..1095919baa7d3 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -2264,8 +2264,8 @@ ExprResult Parser::ParseOpenMPParensExpr(StringRef ClauseName, return ExprError(); SourceLocation ELoc = Tok.getLocation(); - ExprResult LHS(ParseCastExpression( - /*isUnaryExpression=*/false, IsAddressOfOperand, NotTypeCast)); + ExprResult LHS(ParseCastExpression(AnyCastExpr, IsAddressOfOperand, + NotTypeCast)); ExprResult Val(ParseRHSOfBinaryExpression(LHS, prec::Conditional)); Val = Actions.ActOnFinishFullExpr(Val.get(), ELoc, /*DiscardedValue*/ false); @@ -2513,7 +2513,7 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind, Kind == OMPC_if; if (NeedAnExpression) { SourceLocation ELoc = Tok.getLocation(); - ExprResult LHS(ParseCastExpression(false, false, NotTypeCast)); + ExprResult LHS(ParseCastExpression(AnyCastExpr, false, NotTypeCast)); Val = ParseRHSOfBinaryExpression(LHS, prec::Conditional); Val = Actions.ActOnFinishFullExpr(Val.get(), ELoc, /*DiscardedValue*/ false); diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index 928bc5aa25b35..35cee596bb016 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -130,7 +130,9 @@ Decl *Parser::ParseTemplateDeclarationOrSpecialization( if (TryConsumeToken(tok::kw_requires)) { OptionalRequiresClauseConstraintER = - Actions.CorrectDelayedTyposInExpr(ParseConstraintExpression()); + Actions.CorrectDelayedTyposInExpr( + ParseConstraintLogicalOrExpression( + /*IsTrailingRequiresClause=*/false)); if (!OptionalRequiresClauseConstraintER.isUsable()) { // Skip until the semi-colon or a '}'. 
SkipUntil(tok::r_brace, StopAtSemi | StopBeforeMatch); @@ -254,8 +256,12 @@ Decl *Parser::ParseSingleDeclarationAfterTemplate( }); LateParsedAttrList LateParsedAttrs(true); - if (DeclaratorInfo.isFunctionDeclarator()) + if (DeclaratorInfo.isFunctionDeclarator()) { + if (Tok.is(tok::kw_requires)) + ParseTrailingRequiresClause(DeclaratorInfo); + MaybeParseGNUAttributes(DeclaratorInfo, &LateParsedAttrs); + } if (DeclaratorInfo.isFunctionDeclarator() && isStartOfFunctionDefinition(DeclaratorInfo)) { diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp index 418729a4b2658..9ea800c89a775 100644 --- a/clang/lib/Parse/ParseTentative.cpp +++ b/clang/lib/Parse/ParseTentative.cpp @@ -1031,6 +1031,10 @@ Parser::TPResult Parser::TryParseDeclarator(bool mayBeAbstract, // direct-declarator '[' constant-expression[opt] ']' // direct-abstract-declarator[opt] '[' constant-expression[opt] ']' TPR = TryParseBracketDeclarator(); + } else if (Tok.is(tok::kw_requires)) { + // declarator requires-clause + // A requires clause indicates a function declaration. + TPR = TPResult::True; } else { break; } @@ -2015,7 +2019,6 @@ Parser::TryParseParameterDeclarationClause(bool *InvalidAsDeclaration, /// 'throw' '(' type-id-list[opt] ')' /// Parser::TPResult Parser::TryParseFunctionDeclarator() { - // The '(' is already parsed. TPResult TPR = TryParseParameterDeclarationClause(); diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 6216206690b0c..a905ebc673056 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -2008,7 +2008,7 @@ static bool fixOverloadedReinterpretCastExpr(Sema &Self, QualType DestType, // No guarantees that ResolveAndFixSingleFunctionTemplateSpecialization // preserves Result. Result = E; - if (!Self.resolveAndFixAddressOfOnlyViableOverloadCandidate( + if (!Self.resolveAndFixAddressOfSingleOverloadCandidate( Result, /*DoFunctionPointerConversion=*/true)) return false; return Result.isUsable(); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 6c54fd1915771..682d2ebf97689 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -340,7 +340,8 @@ void Sema::checkFortifiedBuiltinMemoryFunction(FunctionDecl *FD, case Builtin::BI__builtin___strncat_chk: case Builtin::BI__builtin___strncpy_chk: case Builtin::BI__builtin___stpncpy_chk: - case Builtin::BI__builtin___memccpy_chk: { + case Builtin::BI__builtin___memccpy_chk: + case Builtin::BI__builtin___mempcpy_chk: { DiagID = diag::warn_builtin_chk_overflow; IsChkVariant = true; SizeIndex = TheCall->getNumArgs() - 2; @@ -379,7 +380,9 @@ void Sema::checkFortifiedBuiltinMemoryFunction(FunctionDecl *FD, case Builtin::BImemmove: case Builtin::BI__builtin_memmove: case Builtin::BImemset: - case Builtin::BI__builtin_memset: { + case Builtin::BI__builtin_memset: + case Builtin::BImempcpy: + case Builtin::BI__builtin_mempcpy: { DiagID = diag::warn_fortify_source_overflow; SizeIndex = TheCall->getNumArgs() - 1; ObjectIndex = 0; diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 7f0bdc9b47822..018ac2d7dc9d1 100755 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "clang/Sema/SemaConcept.h" #include "clang/Sema/Sema.h" #include "clang/Sema/SemaInternal.h" #include "clang/Sema/SemaDiagnostic.h" @@ -18,12 +19,16 @@ #include "clang/Sema/Template.h" #include 
"clang/AST/ExprCXX.h" #include "clang/AST/RecursiveASTVisitor.h" +#include "clang/Basic/OperatorPrecedence.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerUnion.h" using namespace clang; using namespace sema; -bool Sema::CheckConstraintExpression(Expr *ConstraintExpression) { +bool +Sema::CheckConstraintExpression(Expr *ConstraintExpression, Token NextToken, + bool *PossibleNonPrimary, + bool IsTrailingRequiresClause) { // C++2a [temp.constr.atomic]p1 // ..E shall be a constant expression of type bool. @@ -31,22 +36,56 @@ bool Sema::CheckConstraintExpression(Expr *ConstraintExpression) { if (auto *BinOp = dyn_cast(ConstraintExpression)) { if (BinOp->getOpcode() == BO_LAnd || BinOp->getOpcode() == BO_LOr) - return CheckConstraintExpression(BinOp->getLHS()) && - CheckConstraintExpression(BinOp->getRHS()); + return CheckConstraintExpression(BinOp->getLHS(), NextToken, + PossibleNonPrimary) && + CheckConstraintExpression(BinOp->getRHS(), NextToken, + PossibleNonPrimary); } else if (auto *C = dyn_cast(ConstraintExpression)) - return CheckConstraintExpression(C->getSubExpr()); + return CheckConstraintExpression(C->getSubExpr(), NextToken, + PossibleNonPrimary); + + QualType Type = ConstraintExpression->getType(); + + auto CheckForNonPrimary = [&] { + if (PossibleNonPrimary) + *PossibleNonPrimary = + // We have the following case: + // template requires func(0) struct S { }; + // The user probably isn't aware of the parentheses required around + // the function call, and we're only going to parse 'func' as the + // primary-expression, and complain that it is of non-bool type. + (NextToken.is(tok::l_paren) && + (IsTrailingRequiresClause || + (Type->isDependentType() && + IsDependentFunctionNameExpr(ConstraintExpression)) || + Type->isFunctionType() || + Type->isSpecificBuiltinType(BuiltinType::Overload))) || + // We have the following case: + // template requires size_ == 0 struct S { }; + // The user probably isn't aware of the parentheses required around + // the binary operator, and we're only going to parse 'func' as the + // first operand, and complain that it is of non-bool type. + getBinOpPrecedence(NextToken.getKind(), + /*GreaterThanIsOperator=*/true, + getLangOpts().CPlusPlus11) > prec::LogicalAnd; + }; // An atomic constraint! 
- if (ConstraintExpression->isTypeDependent()) + if (ConstraintExpression->isTypeDependent()) { + CheckForNonPrimary(); return true; + } - QualType Type = ConstraintExpression->getType(); if (!Context.hasSameUnqualifiedType(Type, Context.BoolTy)) { Diag(ConstraintExpression->getExprLoc(), diag::err_non_bool_atomic_constraint) << Type << ConstraintExpression->getSourceRange(); + CheckForNonPrimary(); return false; } + + if (PossibleNonPrimary) + *PossibleNonPrimary = false; return true; } @@ -417,123 +456,25 @@ void Sema::DiagnoseUnsatisfiedConstraint( } } -namespace { -struct AtomicConstraint { - const Expr *ConstraintExpr; - llvm::Optional> ParameterMapping; - - AtomicConstraint(Sema &S, const Expr *ConstraintExpr) : - ConstraintExpr(ConstraintExpr) { }; - - bool hasMatchingParameterMapping(ASTContext &C, - const AtomicConstraint &Other) const { - if (!ParameterMapping != !Other.ParameterMapping) - return false; - if (!ParameterMapping) - return true; - if (ParameterMapping->size() != Other.ParameterMapping->size()) - return false; - - for (unsigned I = 0, S = ParameterMapping->size(); I < S; ++I) - if (!C.getCanonicalTemplateArgument((*ParameterMapping)[I].getArgument()) - .structurallyEquals(C.getCanonicalTemplateArgument( - (*Other.ParameterMapping)[I].getArgument()))) - return false; - return true; - } - - bool subsumes(ASTContext &C, const AtomicConstraint &Other) const { - // C++ [temp.constr.order] p2 - // - an atomic constraint A subsumes another atomic constraint B - // if and only if the A and B are identical [...] - // - // C++ [temp.constr.atomic] p2 - // Two atomic constraints are identical if they are formed from the - // same expression and the targets of the parameter mappings are - // equivalent according to the rules for expressions [...] - - // We do not actually substitute the parameter mappings into the - // constraint expressions, therefore the constraint expressions are - // the originals, and comparing them will suffice. - if (ConstraintExpr != Other.ConstraintExpr) - return false; - - // Check that the parameter lists are identical - return hasMatchingParameterMapping(C, Other); - } -}; - -/// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is -/// either an atomic constraint, a conjunction of normalized constraints or a -/// disjunction of normalized constraints. 
-struct NormalizedConstraint { - enum CompoundConstraintKind { CCK_Conjunction, CCK_Disjunction }; - - using CompoundConstraint = llvm::PointerIntPair< - std::pair *, 1, - CompoundConstraintKind>; - - llvm::PointerUnion Constraint; - - NormalizedConstraint(AtomicConstraint *C): Constraint{C} { }; - NormalizedConstraint(ASTContext &C, NormalizedConstraint LHS, - NormalizedConstraint RHS, CompoundConstraintKind Kind) - : Constraint{CompoundConstraint{ - new (C) std::pair{LHS, - RHS}, - Kind}} { }; - - CompoundConstraintKind getCompoundKind() const { - assert(!isAtomic() && "getCompoundKind called on atomic constraint."); - return Constraint.get().getInt(); - } - - bool isAtomic() const { return Constraint.is(); } - - NormalizedConstraint &getLHS() const { - assert(!isAtomic() && "getLHS called on atomic constraint."); - return Constraint.get().getPointer()->first; - } - - NormalizedConstraint &getRHS() const { - assert(!isAtomic() && "getRHS called on atomic constraint."); - return Constraint.get().getPointer()->second; +const NormalizedConstraint * +Sema::getNormalizedAssociatedConstraints( + NamedDecl *ConstrainedDecl, ArrayRef AssociatedConstraints) { + auto CacheEntry = NormalizationCache.find(ConstrainedDecl); + if (CacheEntry == NormalizationCache.end()) { + auto Normalized = + NormalizedConstraint::fromConstraintExprs(*this, ConstrainedDecl, + AssociatedConstraints); + CacheEntry = + NormalizationCache + .try_emplace(ConstrainedDecl, + Normalized + ? new (Context) NormalizedConstraint( + std::move(*Normalized)) + : nullptr) + .first; } - - AtomicConstraint *getAtomicConstraint() const { - assert(isAtomic() && - "getAtomicConstraint called on non-atomic constraint."); - return Constraint.get(); - } - - static llvm::Optional - fromConstraintExprs(Sema &S, NamedDecl *D, ArrayRef E) { - assert(E.size() != 0); - auto First = fromConstraintExpr(S, D, E[0]); - if (E.size() == 1) - return First; - auto Second = fromConstraintExpr(S, D, E[1]); - if (!Second) - return llvm::Optional{}; - llvm::Optional Conjunction; - Conjunction.emplace(S.Context, std::move(*First), std::move(*Second), - CCK_Conjunction); - for (unsigned I = 2; I < E.size(); ++I) { - auto Next = fromConstraintExpr(S, D, E[I]); - if (!Next) - return llvm::Optional{}; - NormalizedConstraint NewConjunction(S.Context, std::move(*Conjunction), - std::move(*Next), CCK_Conjunction); - *Conjunction = std::move(NewConjunction); - } - return Conjunction; - } - -private: - static llvm::Optional fromConstraintExpr(Sema &S, - NamedDecl *D, - const Expr *E); -}; + return CacheEntry->second; +} static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, ConceptDecl *Concept, ArrayRef TemplateArgs, @@ -555,11 +496,13 @@ static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, llvm::SmallBitVector OccurringIndices(TemplateParams->size()); S.MarkUsedTemplateParameters(Atomic.ConstraintExpr, /*OnlyDeduced=*/false, /*Depth=*/0, OccurringIndices); - Atomic.ParameterMapping.emplace(); - Atomic.ParameterMapping->reserve(OccurringIndices.size()); - for (unsigned I = 0, C = TemplateParams->size(); I != C; ++I) + Atomic.ParameterMapping.emplace( + MutableArrayRef( + new (S.Context) TemplateArgumentLoc[OccurringIndices.count()], + OccurringIndices.count())); + for (unsigned I = 0, J = 0, C = TemplateParams->size(); I != C; ++I) if (OccurringIndices[I]) - Atomic.ParameterMapping->push_back( + new (&(*Atomic.ParameterMapping)[J++]) TemplateArgumentLoc( S.getIdentityTemplateArgumentLoc(TemplateParams->begin()[I], // Here we 
assume we do not support things like // template @@ -585,6 +528,30 @@ static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, return false; } +Optional +NormalizedConstraint::fromConstraintExprs(Sema &S, NamedDecl *D, + ArrayRef E) { + assert(E.size() != 0); + auto First = fromConstraintExpr(S, D, E[0]); + if (E.size() == 1) + return First; + auto Second = fromConstraintExpr(S, D, E[1]); + if (!Second) + return None; + llvm::Optional Conjunction; + Conjunction.emplace(S.Context, std::move(*First), std::move(*Second), + CCK_Conjunction); + for (unsigned I = 2; I < E.size(); ++I) { + auto Next = fromConstraintExpr(S, D, E[I]); + if (!Next) + return llvm::Optional{}; + NormalizedConstraint NewConjunction(S.Context, std::move(*Conjunction), + std::move(*Next), CCK_Conjunction); + *Conjunction = std::move(NewConjunction); + } + return Conjunction; +} + llvm::Optional NormalizedConstraint::fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E) { assert(E != nullptr); @@ -604,11 +571,11 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E) { return None; return NormalizedConstraint( - S.Context, *LHS, *RHS, + S.Context, std::move(*LHS), std::move(*RHS), BO->getOpcode() == BO_LAnd ? CCK_Conjunction : CCK_Disjunction); } } else if (auto *CSE = dyn_cast(E)) { - Optional SubNF; + const NormalizedConstraint *SubNF; { Sema::InstantiatingTemplate Inst( S, CSE->getExprLoc(), @@ -623,24 +590,26 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E) { // constraint. If any such substitution results in an invalid type or // expression, the program is ill-formed; no diagnostic is required. // [...] - SubNF = fromConstraintExpr(S, CSE->getNamedConcept(), - CSE->getNamedConcept()->getConstraintExpr()); + ConceptDecl *CD = CSE->getNamedConcept(); + SubNF = S.getNormalizedAssociatedConstraints(CD, + {CD->getConstraintExpr()}); if (!SubNF) return None; } + Optional New; + New.emplace(S.Context, *SubNF); + if (substituteParameterMappings( - S, *SubNF, CSE->getNamedConcept(), + S, *New, CSE->getNamedConcept(), CSE->getTemplateArguments(), CSE->getTemplateArgsAsWritten())) return None; - return SubNF; + return New; } return NormalizedConstraint{new (S.Context) AtomicConstraint(S, E)}; } -} // namespace - using NormalForm = llvm::SmallVector, 4>; @@ -703,22 +672,9 @@ static NormalForm makeDNF(const NormalizedConstraint &Normalized) { return Res; } -static bool subsumes(Sema &S, NamedDecl *DP, ArrayRef P, - NamedDecl *DQ, ArrayRef Q, bool &Subsumes) { - // C++ [temp.constr.order] p2 - // In order to determine if a constraint P subsumes a constraint Q, P is - // transformed into disjunctive normal form, and Q is transformed into - // conjunctive normal form. [...] 
- auto PNormalized = NormalizedConstraint::fromConstraintExprs(S, DP, P); - if (!PNormalized) - return true; - const NormalForm PDNF = makeDNF(*PNormalized); - - auto QNormalized = NormalizedConstraint::fromConstraintExprs(S, DQ, Q); - if (!QNormalized) - return true; - const NormalForm QCNF = makeCNF(*QNormalized); - +template +static bool subsumes(NormalForm PDNF, NormalForm QCNF, + AtomicSubsumptionEvaluator E) { // C++ [temp.constr.order] p2 // Then, P subsumes Q if and only if, for every disjunctive clause Pi in the // disjunctive normal form of P, Pi subsumes every conjunctive clause Qj in @@ -733,7 +689,7 @@ static bool subsumes(Sema &S, NamedDecl *DP, ArrayRef P, bool Found = false; for (const AtomicConstraint *Pia : Pi) { for (const AtomicConstraint *Qjb : Qj) { - if (Pia->subsumes(S.Context, *Qjb)) { + if (E(*Pia, *Qjb)) { Found = true; break; } @@ -741,13 +697,32 @@ static bool subsumes(Sema &S, NamedDecl *DP, ArrayRef P, if (Found) break; } - if (!Found) { - Subsumes = false; + if (!Found) return false; - } } } - Subsumes = true; + return true; +} + +template +static bool subsumes(Sema &S, NamedDecl *DP, ArrayRef P, + NamedDecl *DQ, ArrayRef Q, bool &Subsumes, + AtomicSubsumptionEvaluator E) { + // C++ [temp.constr.order] p2 + // In order to determine if a constraint P subsumes a constraint Q, P is + // transformed into disjunctive normal form, and Q is transformed into + // conjunctive normal form. [...] + auto *PNormalized = S.getNormalizedAssociatedConstraints(DP, P); + if (!PNormalized) + return true; + const NormalForm PDNF = makeDNF(*PNormalized); + + auto *QNormalized = S.getNormalizedAssociatedConstraints(DQ, Q); + if (!QNormalized) + return true; + const NormalForm QCNF = makeCNF(*QNormalized); + + Subsumes = subsumes(PDNF, QCNF, E); return false; } @@ -770,8 +745,84 @@ bool Sema::IsAtLeastAsConstrained(NamedDecl *D1, ArrayRef AC1, Result = CacheEntry->second; return false; } - if (subsumes(*this, D1, AC1, D2, AC2, Result)) + + if (subsumes(*this, D1, AC1, D2, AC2, Result, + [this] (const AtomicConstraint &A, const AtomicConstraint &B) { + return A.subsumes(Context, B); + })) return true; SubsumptionCache.try_emplace(Key, Result); return false; -} \ No newline at end of file +} + +bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic(NamedDecl *D1, + ArrayRef AC1, NamedDecl *D2, ArrayRef AC2) { + if (isSFINAEContext()) + // No need to work here because our notes would be discarded. + return false; + + if (AC1.empty() || AC2.empty()) + return false; + + auto NormalExprEvaluator = + [this] (const AtomicConstraint &A, const AtomicConstraint &B) { + return A.subsumes(Context, B); + }; + + const Expr *AmbiguousAtomic1 = nullptr, *AmbiguousAtomic2 = nullptr; + auto IdenticalExprEvaluator = + [&] (const AtomicConstraint &A, const AtomicConstraint &B) { + if (!A.hasMatchingParameterMapping(Context, B)) + return false; + const Expr *EA = A.ConstraintExpr, *EB = B.ConstraintExpr; + if (EA == EB) + return true; + + // Not the same source level expression - are the expressions + // identical? 
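// For example (illustrative only):
//   template<typename T> void f() requires (sizeof(T) > 4);          // #1
//   template<typename T> void f() requires (sizeof(T) > 4) && true;  // #2
// The two 'sizeof(T) > 4' atoms are distinct Expr nodes from distinct
// declarations, so they are not 'identical' for subsumption purposes, yet
// they profile to the same canonical form - exactly the situation this
// evaluator detects below.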
+ llvm::FoldingSetNodeID IDA, IDB; + EA->Profile(IDA, Context, /*Canonical=*/true); + EB->Profile(IDB, Context, /*Canonical=*/true); + if (IDA != IDB) + return false; + + AmbiguousAtomic1 = EA; + AmbiguousAtomic2 = EB; + return true; + }; + + { + // The subsumption checks might cause diagnostics. + SFINAETrap Trap(*this); + auto *Normalized1 = getNormalizedAssociatedConstraints(D1, AC1); + if (!Normalized1) + return false; + const NormalForm DNF1 = makeDNF(*Normalized1); + const NormalForm CNF1 = makeCNF(*Normalized1); + + auto *Normalized2 = getNormalizedAssociatedConstraints(D2, AC2); + if (!Normalized2) + return false; + const NormalForm DNF2 = makeDNF(*Normalized2); + const NormalForm CNF2 = makeCNF(*Normalized2); + + bool Is1AtLeastAs2Normally = subsumes(DNF1, CNF2, NormalExprEvaluator); + bool Is2AtLeastAs1Normally = subsumes(DNF2, CNF1, NormalExprEvaluator); + bool Is1AtLeastAs2 = subsumes(DNF1, CNF2, IdenticalExprEvaluator); + bool Is2AtLeastAs1 = subsumes(DNF2, CNF1, IdenticalExprEvaluator); + if (Is1AtLeastAs2 == Is1AtLeastAs2Normally && + Is2AtLeastAs1 == Is2AtLeastAs1Normally) + // Same result - no ambiguity was caused by identical atomic expressions. + return false; + } + + // A different result! Some ambiguous atomic constraint(s) caused a difference. + assert(AmbiguousAtomic1 && AmbiguousAtomic2); + + Diag(AmbiguousAtomic1->getBeginLoc(), diag::note_ambiguous_atomic_constraints) + << AmbiguousAtomic1->getSourceRange(); + Diag(AmbiguousAtomic2->getBeginLoc(), + diag::note_ambiguous_atomic_constraints_similar_expression) + << AmbiguousAtomic2->getSourceRange(); + return true; +} diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp index 271c4a10f3e44..6dc9e342beb92 100644 --- a/clang/lib/Sema/SemaCoroutine.cpp +++ b/clang/lib/Sema/SemaCoroutine.cpp @@ -1228,7 +1228,7 @@ bool CoroutineStmtBuilder::makeNewAndDeleteExpr() { return false; if (RequiresNoThrowAlloc) { - const auto *FT = OperatorNew->getType()->getAs<FunctionProtoType>(); + const auto *FT = OperatorNew->getType()->castAs<FunctionProtoType>(); if (!FT->isNothrow(/*ResultIfDependent*/ false)) { S.Diag(OperatorNew->getLocation(), diag::err_coroutine_promise_new_requires_nothrow) @@ -1281,7 +1281,7 @@ bool CoroutineStmtBuilder::makeNewAndDeleteExpr() { // Check if we need to pass the size. const auto *OpDeleteType = - OpDeleteQualType.getTypePtr()->getAs<FunctionProtoType>(); + OpDeleteQualType.getTypePtr()->castAs<FunctionProtoType>(); if (OpDeleteType->getNumParams() > 1) DeleteArgs.push_back(FrameSize); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 51cc184f4d015..26b3a01b138c0 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7902,7 +7902,13 @@ struct FindOverriddenMethod { Path.Decls = Path.Decls.slice(1)) { NamedDecl *D = Path.Decls.front(); if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(D)) { - if (MD->isVirtual() && !S->IsOverload(Method, MD, false)) + if (MD->isVirtual() && + !S->IsOverload( + Method, MD, /*UseMemberUsingDeclRules=*/false, + /*ConsiderCudaAttrs=*/true, + // C++2a [class.virtual]p2 does not consider requires clauses + // when overriding.
+ /*ConsiderRequiresClauses=*/false)) return true; } } @@ -8247,7 +8253,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, NewFD = FunctionDecl::Create(SemaRef.Context, DC, D.getBeginLoc(), NameInfo, R, TInfo, SC, isInline, HasPrototype, - CSK_unspecified); + CSK_unspecified, + /*TrailingRequiresClause=*/nullptr); if (D.isInvalidType()) NewFD->setInvalidDecl(); @@ -8264,6 +8271,7 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, ConstexprKind = CSK_unspecified; D.getMutableDeclSpec().ClearConstexprSpec(); } + Expr *TrailingRequiresClause = D.getTrailingRequiresClause(); // Check that the return type is not an abstract class type. // For record types, this is done by the AbstractClassUsageDiagnoser once @@ -8283,7 +8291,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, return CXXConstructorDecl::Create( SemaRef.Context, cast(DC), D.getBeginLoc(), NameInfo, R, TInfo, ExplicitSpecifier, isInline, - /*isImplicitlyDeclared=*/false, ConstexprKind); + /*isImplicitlyDeclared=*/false, ConstexprKind, InheritedConstructor(), + TrailingRequiresClause); } else if (Name.getNameKind() == DeclarationName::CXXDestructorName) { // This is a C++ destructor declaration. @@ -8292,8 +8301,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, CXXRecordDecl *Record = cast(DC); CXXDestructorDecl *NewDD = CXXDestructorDecl::Create( SemaRef.Context, Record, D.getBeginLoc(), NameInfo, R, TInfo, - isInline, - /*isImplicitlyDeclared=*/false, ConstexprKind); + isInline, /*isImplicitlyDeclared=*/false, ConstexprKind, + TrailingRequiresClause); // If the destructor needs an implicit exception specification, set it // now. FIXME: It'd be nice to be able to create the right type to start @@ -8313,7 +8322,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, return FunctionDecl::Create(SemaRef.Context, DC, D.getBeginLoc(), D.getIdentifierLoc(), Name, R, TInfo, SC, isInline, - /*hasPrototype=*/true, ConstexprKind); + /*hasPrototype=*/true, ConstexprKind, + TrailingRequiresClause); } } else if (Name.getNameKind() == DeclarationName::CXXConversionFunctionName) { @@ -8330,9 +8340,14 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, IsVirtualOkay = true; return CXXConversionDecl::Create( SemaRef.Context, cast(DC), D.getBeginLoc(), NameInfo, R, - TInfo, isInline, ExplicitSpecifier, ConstexprKind, SourceLocation()); + TInfo, isInline, ExplicitSpecifier, ConstexprKind, SourceLocation(), + TrailingRequiresClause); } else if (Name.getNameKind() == DeclarationName::CXXDeductionGuideName) { + if (TrailingRequiresClause) + SemaRef.Diag(TrailingRequiresClause->getBeginLoc(), + diag::err_trailing_requires_clause_on_deduction_guide) + << TrailingRequiresClause->getSourceRange(); SemaRef.CheckDeductionGuideDeclarator(D, R, SC); return CXXDeductionGuideDecl::Create(SemaRef.Context, DC, D.getBeginLoc(), @@ -8354,7 +8369,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, // This is a C++ method declaration. 
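// Illustrative of the new deduction-guide diagnostic above (mirrors the
// dcl.decl/p3.cpp test added later in this patch):
//   template<typename T> struct R { R(T); };
//   template<typename T> R(T) -> R<T> requires true;  // error: deduction guide
//                                                     // cannot have a requires clause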
CXXMethodDecl *Ret = CXXMethodDecl::Create( SemaRef.Context, cast(DC), D.getBeginLoc(), NameInfo, R, - TInfo, SC, isInline, ConstexprKind, SourceLocation()); + TInfo, SC, isInline, ConstexprKind, SourceLocation(), + TrailingRequiresClause); IsVirtualOkay = !Ret->isStatic(); return Ret; } else { @@ -8368,7 +8384,7 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, // - we're in C++ (where every function has a prototype), return FunctionDecl::Create(SemaRef.Context, DC, D.getBeginLoc(), NameInfo, R, TInfo, SC, isInline, true /*HasPrototype*/, - ConstexprKind); + ConstexprKind, TrailingRequiresClause); } } @@ -10575,6 +10591,11 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, } } } + if (Method->isVirtual() && NewFD->getTrailingRequiresClause()) + // C++2a [class.virtual]p6 + // A virtual method shall not have a requires-clause. + Diag(NewFD->getTrailingRequiresClause()->getBeginLoc(), + diag::err_constrained_virtual_method); if (Method->isStatic()) checkThisInStaticMemberFunctionType(Method); diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index de3d44eb1f0f8..1b156bdfc9a4f 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -8199,7 +8199,8 @@ NamedDecl * Sema::DeclClonePragmaWeak(NamedDecl *ND, IdentifierInfo *II, NewFD = FunctionDecl::Create( FD->getASTContext(), FD->getDeclContext(), Loc, Loc, DeclarationName(II), FD->getType(), FD->getTypeSourceInfo(), SC_None, - false /*isInlineSpecified*/, FD->hasPrototype(), CSK_unspecified); + false /*isInlineSpecified*/, FD->hasPrototype(), CSK_unspecified, + FD->getTrailingRequiresClause()); NewD = NewFD; if (FD->getQualifier()) diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 6321a28bf25cb..34137657a919e 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -3868,6 +3868,26 @@ void Sema::ActOnStartCXXInClassMemberInitializer() { PushFunctionScope(); } +void Sema::ActOnStartTrailingRequiresClause(Scope *S, Declarator &D) { + if (!D.isFunctionDeclarator()) + return; + auto &FTI = D.getFunctionTypeInfo(); + if (!FTI.Params) + return; + for (auto &Param : ArrayRef(FTI.Params, + FTI.NumParams)) { + auto *ParamDecl = cast(Param.Param); + if (ParamDecl->getDeclName()) + PushOnScopeChains(ParamDecl, S, /*AddToContext=*/false); + } +} + +ExprResult Sema::ActOnFinishTrailingRequiresClause(ExprResult ConstraintExpr) { + if (ConstraintExpr.isInvalid()) + return ExprError(); + return CorrectDelayedTyposInExpr(ConstraintExpr); +} + /// This is invoked after parsing an in-class initializer for a /// non-static C++ class member, and after instantiating an in-class initializer /// in a class template. Such actions are deferred until the class is complete. @@ -12702,7 +12722,8 @@ Sema::findInheritingConstructor(SourceLocation Loc, BaseCtor->getExplicitSpecifier(), /*isInline=*/true, /*isImplicitlyDeclared=*/true, Constexpr ? 
BaseCtor->getConstexprKind() : CSK_unspecified, - InheritedConstructor(Shadow, BaseCtor)); + InheritedConstructor(Shadow, BaseCtor), + BaseCtor->getTrailingRequiresClause()); if (Shadow->isInvalidDecl()) DerivedCtor->setInvalidDecl(); @@ -17094,6 +17115,11 @@ bool Sema::checkThisInStaticMemberFunctionType(CXXMethodDecl *Method) { if (checkThisInStaticMemberFunctionExceptionSpec(Method)) return true; + // Check the trailing requires clause + if (Expr *E = Method->getTrailingRequiresClause()) + if (!Finder.TraverseStmt(E)) + return true; + return checkThisInStaticMemberFunctionAttributes(Method); } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 1b563406552cd..ac2a9e1cff08e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -329,6 +329,30 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, diagnoseUseOfInternalDeclInInlineFunction(*this, D, Loc); + // [expr.prim.id]p4 + // A program that refers explicitly or implicitly to a function with a + // trailing requires-clause whose constraint-expression is not satisfied, + // other than to declare it, is ill-formed. [...] + // + // See if this is a function with constraints that need to be satisfied. + if (FunctionDecl *FD = dyn_cast(D)) { + if (Expr *RC = FD->getTrailingRequiresClause()) { + ConstraintSatisfaction Satisfaction; + bool Failed = CheckConstraintSatisfaction(RC, Satisfaction); + if (Failed) + // A diagnostic will have already been generated (non-constant + // constraint expression, for example) + return true; + if (!Satisfaction.IsSatisfied) { + Diag(Loc, + diag::err_reference_to_function_with_unsatisfied_constraints) + << D; + DiagnoseUnsatisfiedConstraint(Satisfaction); + return true; + } + } + } + return false; } @@ -18051,7 +18075,7 @@ ExprResult Sema::CheckPlaceholderExpr(Expr *E) { // No guarantees that ResolveAndFixSingleFunctionTemplateSpecialization // leaves Result unchanged on failure. Result = E; - if (resolveAndFixAddressOfOnlyViableOverloadCandidate(Result)) + if (resolveAndFixAddressOfSingleOverloadCandidate(Result)) return Result; // If that failed, try to recover with a call. 
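// Sketch of the [expr.prim.id]p4 check added to DiagnoseUseOfDecl above
// (illustrative, assuming -std=c++2a -fconcepts-ts):
//   void f(int) requires false;
//   void (*p)(int) = f;  // error: invalid reference to function 'f':
//                        // constraints not satisfied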
@@ -18188,3 +18212,8 @@ ExprResult Sema::ActOnObjCAvailabilityCheckExpr( return new (Context) ObjCAvailabilityCheckExpr(Version, AtLoc, RParen, Context.BoolTy); } + +bool Sema::IsDependentFunctionNameExpr(Expr *E) { + assert(E->isTypeDependent()); + return isa<UnresolvedLookupExpr>(E); +} diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index d09a3377d2b03..c2d14a44f53d4 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -361,7 +361,8 @@ CXXMethodDecl *Sema::startLambdaDefinition(CXXRecordDecl *Class, TypeSourceInfo *MethodTypeInfo, SourceLocation EndLoc, ArrayRef<ParmVarDecl *> Params, - ConstexprSpecKind ConstexprKind) { + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause) { QualType MethodType = MethodTypeInfo->getType(); TemplateParameterList *TemplateParams = getGenericLambdaTemplateParameterList(getCurLambda(), *this); @@ -395,7 +396,7 @@ CXXMethodDecl *Sema::startLambdaDefinition(CXXRecordDecl *Class, DeclarationNameInfo(MethodName, IntroducerRange.getBegin(), MethodNameLoc), MethodType, MethodTypeInfo, SC_None, - /*isInline=*/true, ConstexprKind, EndLoc); + /*isInline=*/true, ConstexprKind, EndLoc, TrailingRequiresClause); Method->setAccess(AS_public); if (!TemplateParams) Class->addDecl(Method); @@ -972,7 +973,8 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, KnownDependent, Intro.Default); CXXMethodDecl *Method = startLambdaDefinition(Class, Intro.Range, MethodTyInfo, EndLoc, Params, - ParamInfo.getDeclSpec().getConstexprSpecifier()); + ParamInfo.getDeclSpec().getConstexprSpecifier(), + ParamInfo.getTrailingRequiresClause()); if (ExplicitParams) CheckCXXDefaultArguments(Method); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 1f77f2b9342cf..5fb59b545176d 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1137,7 +1137,8 @@ Sema::CheckOverload(Scope *S, FunctionDecl *New, const LookupResult &Old, } bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old, - bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs) { + bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs, + bool ConsiderRequiresClauses) { // C++ [basic.start.main]p2: This function shall not be overloaded. if (New->isMain()) return false; @@ -1273,23 +1274,38 @@ bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old, if (getLangOpts().CUDA && ConsiderCudaAttrs) { // Don't allow overloading of destructors. (In theory we could, but it // would be a giant change to clang.) - if (isa<CXXDestructorDecl>(New)) - return false; - - CUDAFunctionTarget NewTarget = IdentifyCUDATarget(New), - OldTarget = IdentifyCUDATarget(Old); - if (NewTarget == CFT_InvalidTarget) - return false; + if (!isa<CXXDestructorDecl>(New)) { + CUDAFunctionTarget NewTarget = IdentifyCUDATarget(New), + OldTarget = IdentifyCUDATarget(Old); + if (NewTarget != CFT_InvalidTarget) { + assert((OldTarget != CFT_InvalidTarget) && + "Unexpected invalid target."); + + // Allow overloading of functions with same signature and different CUDA + // target attributes. + if (NewTarget != OldTarget) + return true; + } + } + } - assert((OldTarget != CFT_InvalidTarget) && "Unexpected invalid target."); + if (ConsiderRequiresClauses) { + Expr *NewRC = New->getTrailingRequiresClause(), + *OldRC = Old->getTrailingRequiresClause(); + if ((NewRC != nullptr) != (OldRC != nullptr)) + // RCs are most certainly different - these are overloads. + return true; - // Allow overloading of functions with same signature and different CUDA - // target attributes.
- return NewTarget != OldTarget; + if (NewRC) { + llvm::FoldingSetNodeID NewID, OldID; + NewRC->Profile(NewID, Context, /*Canonical=*/true); + OldRC->Profile(OldID, Context, /*Canonical=*/true); + if (NewID != OldID) + // RCs are not equivalent - these are overloads. + return true; + } } - // TODO: Concepts: Check function trailing requires clauses here. - // The signatures match; this is not an overload. return false; } @@ -6258,6 +6274,16 @@ void Sema::AddOverloadCandidate( return; } + if (Expr *RequiresClause = Function->getTrailingRequiresClause()) { + ConstraintSatisfaction Satisfaction; + if (CheckConstraintSatisfaction(RequiresClause, Satisfaction) || + !Satisfaction.IsSatisfied) { + Candidate.Viable = false; + Candidate.FailureKind = ovl_fail_constraints_not_satisfied; + return; + } + } + // Determine the implicit conversion sequences for each of the // arguments. for (unsigned ArgIdx = 0; ArgIdx < Args.size(); ++ArgIdx) { @@ -6774,6 +6800,16 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl, return; } + if (Expr *RequiresClause = Method->getTrailingRequiresClause()) { + ConstraintSatisfaction Satisfaction; + if (CheckConstraintSatisfaction(RequiresClause, Satisfaction) || + !Satisfaction.IsSatisfied) { + Candidate.Viable = false; + Candidate.FailureKind = ovl_fail_constraints_not_satisfied; + return; + } + } + // Determine the implicit conversion sequences for each of the // arguments. for (unsigned ArgIdx = 0; ArgIdx < Args.size(); ++ArgIdx) { @@ -7130,6 +7166,17 @@ void Sema::AddConversionCandidate( return; } + Expr *RequiresClause = Conversion->getTrailingRequiresClause(); + if (RequiresClause) { + ConstraintSatisfaction Satisfaction; + if (CheckConstraintSatisfaction(RequiresClause, Satisfaction) || + !Satisfaction.IsSatisfied) { + Candidate.Viable = false; + Candidate.FailureKind = ovl_fail_constraints_not_satisfied; + return; + } + } + // We won't go through a user-defined type conversion function to convert a // derived to base as such conversions are given Conversion Rank. They only // go through a copy constructor. 13.3.3.1.2-p4 [over.ics.user] @@ -9461,6 +9508,35 @@ bool clang::isBetterOverloadCandidate( return BetterTemplate == Cand1.Function->getPrimaryTemplate(); } + // -- F1 and F2 are non-template functions with the same + // parameter-type-lists, and F1 is more constrained than F2 [...], + if (Cand1.Function && Cand2.Function && !Cand1IsSpecialization && + !Cand2IsSpecialization && Cand1.Function->hasPrototype() && + Cand2.Function->hasPrototype()) { + auto *PT1 = cast<FunctionProtoType>(Cand1.Function->getFunctionType()); + auto *PT2 = cast<FunctionProtoType>(Cand2.Function->getFunctionType()); + if (PT1->getNumParams() == PT2->getNumParams() && + PT1->isVariadic() == PT2->isVariadic() && + S.FunctionParamTypesAreEqual(PT1, PT2)) { + Expr *RC1 = Cand1.Function->getTrailingRequiresClause(); + Expr *RC2 = Cand2.Function->getTrailingRequiresClause(); + if (RC1 && RC2) { + bool AtLeastAsConstrained1, AtLeastAsConstrained2; + if (S.IsAtLeastAsConstrained(Cand1.Function, {RC1}, Cand2.Function, + {RC2}, AtLeastAsConstrained1)) + return false; + if (!AtLeastAsConstrained1) + return false; + if (S.IsAtLeastAsConstrained(Cand2.Function, {RC2}, Cand1.Function, + {RC1}, AtLeastAsConstrained2)) + return false; + if (!AtLeastAsConstrained2) + return true; + } else if (RC1 || RC2) + return RC1 != nullptr; + } + } + // -- F1 is a constructor for a class D, F2 is a constructor for a base // class B of D, and for all arguments the corresponding parameters of // F1 and F2 have the same type.
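// Hypothetical usage of the new tiebreaker (names are assumptions, not from
// the patch):
//   template<typename T> concept Small = sizeof(T) <= 8;
//   template<typename T> struct Codec {
//     void encode();                    // #1
//     void encode() requires Small<T>;  // #2
//   };
//   Codec<int>{}.encode();  // both are viable non-template functions with the
//                           // same parameter-type-list; #2 carries a
//                           // requires-clause, so it is the better candidate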
@@ -9829,6 +9905,24 @@ static bool checkAddressOfFunctionIsAvailable(Sema &S, const FunctionDecl *FD, return false; } + if (const Expr *RC = FD->getTrailingRequiresClause()) { + ConstraintSatisfaction Satisfaction; + if (S.CheckConstraintSatisfaction(RC, Satisfaction)) + return false; + if (!Satisfaction.IsSatisfied) { + if (Complain) { + if (InOverloadResolution) + S.Diag(FD->getBeginLoc(), + diag::note_ovl_candidate_unsatisfied_constraints); + else + S.Diag(Loc, diag::err_addrof_function_constraints_not_satisfied) + << FD; + S.DiagnoseUnsatisfiedConstraint(Satisfaction); + } + return false; + } + } + auto I = llvm::find_if(FD->parameters(), [](const ParmVarDecl *P) { return P->hasAttr<PassObjectSizeAttr>(); }); @@ -9886,6 +9980,55 @@ void Sema::NoteOverloadCandidate(NamedDecl *Found, FunctionDecl *Fn, MaybeEmitInheritedConstructorNote(*this, Found); } +static void +MaybeDiagnoseAmbiguousConstraints(Sema &S, ArrayRef<OverloadCandidate> Cands) { + // Perhaps the ambiguity was caused by two atomic constraints that are + // 'identical' but not equivalent: + // + // void foo() requires (sizeof(T) > 4) { } // #1 + // void foo() requires (sizeof(T) > 4) && T::value { } // #2 + // + // The 'sizeof(T) > 4' constraints are seemingly equivalent and should cause + // #2 to subsume #1, but these constraints are not considered equivalent + // according to the subsumption rules because they are not the same + // source-level construct. This behavior is quite confusing and we should try + // to help the user figure out what happened. + + SmallVector<const Expr *, 3> FirstAC, SecondAC; + FunctionDecl *FirstCand = nullptr, *SecondCand = nullptr; + for (auto I = Cands.begin(), E = Cands.end(); I != E; ++I) { + if (!I->Function) + continue; + SmallVector<const Expr *, 3> AC; + if (auto *Template = I->Function->getPrimaryTemplate()) + Template->getAssociatedConstraints(AC); + else + I->Function->getAssociatedConstraints(AC); + if (AC.empty()) + continue; + if (FirstCand == nullptr) { + FirstCand = I->Function; + FirstAC = AC; + } else if (SecondCand == nullptr) { + SecondCand = I->Function; + SecondAC = AC; + } else { + // We have more than one pair of constrained functions - this check is + // expensive and we'd rather not try to diagnose it. + return; + } + } + if (!SecondCand) + return; + // The diagnostic can only happen if there are associated constraints on + // both sides (there needs to be some identical atomic constraint). + if (S.MaybeEmitAmbiguousAtomicConstraintsDiagnostic(FirstCand, FirstAC, + SecondCand, SecondAC)) + // Just show the user one diagnostic, they'll probably figure it out + // from here. + return; +} + // Notes the location of all overload candidates designated through // OverloadedExpr void Sema::NoteAllOverloadCandidates(Expr *OverloadedExpr, QualType DestType, @@ -10771,6 +10914,23 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand, case ovl_non_default_multiversion_function: // Do nothing, these should simply be ignored.
break; + + case ovl_fail_constraints_not_satisfied: { + std::string FnDesc; + std::pair FnKindPair = + ClassifyOverloadCandidate(S, Cand->FoundDecl, Fn, + Cand->getRewriteKind(), FnDesc); + + S.Diag(Fn->getLocation(), + diag::note_ovl_candidate_constraints_not_satisfied) + << (unsigned)FnKindPair.first << (unsigned)ocs_non_template + << FnDesc /* Ignored */; + ConstraintSatisfaction Satisfaction; + if (S.CheckConstraintSatisfaction(Fn->getTrailingRequiresClause(), + Satisfaction)) + break; + S.DiagnoseUnsatisfiedConstraint(Satisfaction); + } } } @@ -11156,6 +11316,9 @@ void OverloadCandidateSet::NoteCandidates(PartialDiagnosticAt PD, S.Diag(PD.first, PD.second); NoteCandidates(S, Args, Cands, Opc, OpLoc); + + if (OCD == OCD_AmbiguousCandidates) + MaybeDiagnoseAmbiguousConstraints(S, {begin(), end()}); } void OverloadCandidateSet::NoteCandidates(Sema &S, ArrayRef Args, @@ -11804,15 +11967,33 @@ Sema::ResolveAddressOfOverloadedFunction(Expr *AddressOfExpr, /// resolve that function to a single function that can have its address taken. /// This will modify `Pair` iff it returns non-null. /// -/// This routine can only realistically succeed if all but one candidates in the -/// overload set for SrcExpr cannot have their addresses taken. +/// This routine can only succeed if from all of the candidates in the overload +/// set for SrcExpr that can have their addresses taken, there is one candidate +/// that is more constrained than the rest. FunctionDecl * -Sema::resolveAddressOfOnlyViableOverloadCandidate(Expr *E, - DeclAccessPair &Pair) { +Sema::resolveAddressOfSingleOverloadCandidate(Expr *E, DeclAccessPair &Pair) { OverloadExpr::FindResult R = OverloadExpr::find(E); OverloadExpr *Ovl = R.Expression; + bool IsResultAmbiguous = false; FunctionDecl *Result = nullptr; DeclAccessPair DAP; + SmallVector AmbiguousDecls; + + auto CheckMoreConstrained = + [&] (FunctionDecl *FD1, FunctionDecl *FD2) -> Optional { + SmallVector AC1, AC2; + FD1->getAssociatedConstraints(AC1); + FD2->getAssociatedConstraints(AC2); + bool AtLeastAsConstrained1, AtLeastAsConstrained2; + if (IsAtLeastAsConstrained(FD1, AC1, FD2, AC2, AtLeastAsConstrained1)) + return None; + if (IsAtLeastAsConstrained(FD2, AC2, FD1, AC1, AtLeastAsConstrained2)) + return None; + if (AtLeastAsConstrained1 == AtLeastAsConstrained2) + return None; + return AtLeastAsConstrained1; + }; + // Don't use the AddressOfResolver because we're specifically looking for // cases where we have one overload candidate that lacks // enable_if/pass_object_size/... @@ -11824,32 +12005,54 @@ Sema::resolveAddressOfOnlyViableOverloadCandidate(Expr *E, if (!checkAddressOfFunctionIsAvailable(FD)) continue; - // We have more than one result; quit. - if (Result) - return nullptr; + // We have more than one result - see if it is more constrained than the + // previous one. + if (Result) { + Optional MoreConstrainedThanPrevious = CheckMoreConstrained(FD, + Result); + if (!MoreConstrainedThanPrevious) { + IsResultAmbiguous = true; + AmbiguousDecls.push_back(FD); + continue; + } + if (!*MoreConstrainedThanPrevious) + continue; + // FD is more constrained - replace Result with it. + } + IsResultAmbiguous = false; DAP = I.getPair(); Result = FD; } - if (Result) + if (IsResultAmbiguous) + return nullptr; + + if (Result) { + SmallVector ResultAC; + // We skipped over some ambiguous declarations which might be ambiguous with + // the selected result. 
+ for (FunctionDecl *Skipped : AmbiguousDecls) + if (!CheckMoreConstrained(Skipped, Result).hasValue()) + return nullptr; Pair = DAP; + } return Result; } /// Given an overloaded function, tries to turn it into a non-overloaded -/// function reference using resolveAddressOfOnlyViableOverloadCandidate. This +/// function reference using resolveAddressOfSingleOverloadCandidate. This /// will perform access checks, diagnose the use of the resultant decl, and, if /// requested, potentially perform a function-to-pointer decay. /// -/// Returns false if resolveAddressOfOnlyViableOverloadCandidate fails. +/// Returns false if resolveAddressOfSingleOverloadCandidate fails. /// Otherwise, returns true. This may emit diagnostics and return true. -bool Sema::resolveAndFixAddressOfOnlyViableOverloadCandidate( +bool Sema::resolveAndFixAddressOfSingleOverloadCandidate( ExprResult &SrcExpr, bool DoFunctionPointerConverion) { Expr *E = SrcExpr.get(); assert(E->getType() == Context.OverloadTy && "SrcExpr must be an overload"); DeclAccessPair DAP; - FunctionDecl *Found = resolveAddressOfOnlyViableOverloadCandidate(E, DAP); + FunctionDecl *Found = resolveAddressOfSingleOverloadCandidate(E, DAP); if (!Found || Found->isCPUDispatchMultiVersion() || Found->isCPUSpecificMultiVersion()) return false; diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index ade8a5a6ac148..69aabcd7d6345 100755 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -3750,6 +3750,11 @@ static void checkMoreSpecializedThanPrimary(Sema &S, PartialSpecDecl *Partial) { } S.Diag(Template->getLocation(), diag::note_template_decl_here); + SmallVector PartialAC, TemplateAC; + Template->getAssociatedConstraints(TemplateAC); + Partial->getAssociatedConstraints(PartialAC); + S.MaybeEmitAmbiguousAtomicConstraintsDiagnostic(Partial, PartialAC, Template, + TemplateAC); } static void diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 521160d1ad23e..d267ae8572e44 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3389,11 +3389,6 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( PartialOverloading)) return Result; - if (TemplateDeductionResult Result - = CheckDeducedArgumentConstraints(*this, FunctionTemplate, Builder, - Info)) - return Result; - // C++ [temp.deduct.call]p10: [DR1391] // If deduction succeeds for all parameters that contain // template-parameters that participate in template argument deduction, @@ -3439,6 +3434,23 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( return TDK_SubstitutionFailure; } + // C++2a [temp.deduct]p5 + // [...] When all template arguments have been deduced [...] all uses of + // template parameters [...] are replaced with the corresponding deduced + // or default argument values. + // [...] If the function template has associated constraints + // ([temp.constr.decl]), those constraints are checked for satisfaction + // ([temp.constr.constr]). If the constraints are not satisfied, type + // deduction fails. 
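// For example (illustrative only):
//   template<typename T> void f(T) requires (sizeof(T) == 1);
//   f('x');  // OK: T = char is deduced and the constraint holds
//   f(1);    // T = int is deduced first; the satisfaction check then fails,
//            // and the candidate is reported with TDK_ConstraintsNotSatisfied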
+ if (CheckInstantiatedFunctionTemplateConstraints(Info.getLocation(), + Specialization, Builder, Info.AssociatedConstraintsSatisfaction)) + return TDK_MiscellaneousDeductionFailure; + + if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) { + Info.reset(TemplateArgumentList::CreateCopy(Context, Builder)); + return TDK_ConstraintsNotSatisfied; + } + if (OriginalCallArgs) { // C++ [temp.deduct.call]p4: // In general, the deduction process attempts to find template argument @@ -3559,7 +3571,7 @@ ResolveOverloadForDeduction(Sema &S, TemplateParameterList *TemplateParams, DeclAccessPair DAP; if (FunctionDecl *Viable = - S.resolveAddressOfOnlyViableOverloadCandidate(Arg, DAP)) + S.resolveAddressOfSingleOverloadCandidate(Arg, DAP)) return GetTypeOfFunction(S, R, Viable); return {}; diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 90b85db6d0883..a9018cfeccbc1 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -150,7 +150,7 @@ Sema::getTemplateInstantiationArgs(NamedDecl *D, break; // If this function is a generic lambda specialization, we are done. - if (isGenericLambdaCallOperatorSpecialization(Function)) + if (isGenericLambdaCallOperatorOrStaticInvokerSpecialization(Function)) break; } else if (FunctionTemplateDecl *FunTmpl diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 4d248f4a292cb..8d55a7e309ef3 100755 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -656,11 +656,10 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs, LateAttrs->push_back(LateInstantiatedAttribute(TmplAttr, Saved, New)); } else { // Allow 'this' within late-parsed attributes. - NamedDecl *ND = dyn_cast(New); - CXXRecordDecl *ThisContext = - dyn_cast_or_null(ND->getDeclContext()); + auto *ND = cast(New); + auto *ThisContext = dyn_cast_or_null(ND->getDeclContext()); CXXThisScopeRAII ThisScope(*this, ThisContext, Qualifiers(), - ND && ND->isCXXInstanceMember()); + ND->isCXXInstanceMember()); Attr *NewAttr = sema::instantiateTemplateAttribute(TmplAttr, Context, *this, TemplateArgs); @@ -1847,6 +1846,18 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( return nullptr; } + // FIXME: Concepts: Do not substitute into constraint expressions + Expr *TrailingRequiresClause = D->getTrailingRequiresClause(); + if (TrailingRequiresClause) { + ExprResult SubstRC = SemaRef.SubstExpr(TrailingRequiresClause, + TemplateArgs); + if (SubstRC.isInvalid()) + return nullptr; + TrailingRequiresClause = SubstRC.get(); + if (!SemaRef.CheckConstraintExpression(TrailingRequiresClause)) + return nullptr; + } + // If we're instantiating a local function declaration, put the result // in the enclosing namespace; otherwise we need to find the instantiated // context. 
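// Sketch of what the substitution above covers (hypothetical usage):
//   template<typename T> struct S { void f() requires (sizeof(T) > 1); };
// Instantiating a member of S<int> substitutes T = int into the clause before
// it is checked; the FIXME records that C++2a ultimately wants satisfaction
// checked against the unsubstituted expression instead.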
@@ -1883,7 +1894,8 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( Function = FunctionDecl::Create( SemaRef.Context, DC, D->getInnerLocStart(), NameInfo, T, TInfo, D->getCanonicalDecl()->getStorageClass(), D->isInlineSpecified(), - D->hasWrittenPrototype(), D->getConstexprKind()); + D->hasWrittenPrototype(), D->getConstexprKind(), + TrailingRequiresClause); Function->setRangeEnd(D->getSourceRange().getEnd()); } @@ -1910,6 +1922,9 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( Params[P]->setOwningFunction(Function); Function->setParams(Params); + if (TrailingRequiresClause) + Function->setTrailingRequiresClause(TrailingRequiresClause); + if (TemplateParams) { // Our resulting instantiation is actually a function template, since we // are substituting only the outer template parameters. For example, given @@ -2169,6 +2184,18 @@ Decl *TemplateDeclInstantiator::VisitCXXMethodDecl( return nullptr; } + // FIXME: Concepts: Do not substitute into constraint expressions + Expr *TrailingRequiresClause = D->getTrailingRequiresClause(); + if (TrailingRequiresClause) { + ExprResult SubstRC = SemaRef.SubstExpr(TrailingRequiresClause, + TemplateArgs); + if (SubstRC.isInvalid()) + return nullptr; + TrailingRequiresClause = SubstRC.get(); + if (!SemaRef.CheckConstraintExpression(TrailingRequiresClause)) + return nullptr; + } + DeclContext *DC = Owner; if (isFriend) { if (QualifierLoc) { @@ -2201,23 +2228,27 @@ Decl *TemplateDeclInstantiator::VisitCXXMethodDecl( Method = CXXConstructorDecl::Create( SemaRef.Context, Record, StartLoc, NameInfo, T, TInfo, InstantiatedExplicitSpecifier, Constructor->isInlineSpecified(), false, - Constructor->getConstexprKind()); + Constructor->getConstexprKind(), InheritedConstructor(), + TrailingRequiresClause); Method->setRangeEnd(Constructor->getEndLoc()); } else if (CXXDestructorDecl *Destructor = dyn_cast(D)) { Method = CXXDestructorDecl::Create( SemaRef.Context, Record, StartLoc, NameInfo, T, TInfo, - Destructor->isInlineSpecified(), false, Destructor->getConstexprKind()); + Destructor->isInlineSpecified(), false, Destructor->getConstexprKind(), + TrailingRequiresClause); Method->setRangeEnd(Destructor->getEndLoc()); } else if (CXXConversionDecl *Conversion = dyn_cast(D)) { Method = CXXConversionDecl::Create( SemaRef.Context, Record, StartLoc, NameInfo, T, TInfo, Conversion->isInlineSpecified(), InstantiatedExplicitSpecifier, - Conversion->getConstexprKind(), Conversion->getEndLoc()); + Conversion->getConstexprKind(), Conversion->getEndLoc(), + TrailingRequiresClause); } else { StorageClass SC = D->isStatic() ? SC_Static : SC_None; Method = CXXMethodDecl::Create(SemaRef.Context, Record, StartLoc, NameInfo, T, TInfo, SC, D->isInlineSpecified(), - D->getConstexprKind(), D->getEndLoc()); + D->getConstexprKind(), D->getEndLoc(), + TrailingRequiresClause); } if (D->isInlined()) @@ -4119,6 +4150,48 @@ void Sema::InstantiateExceptionSpec(SourceLocation PointOfInstantiation, TemplateArgs); } +bool Sema::CheckInstantiatedFunctionTemplateConstraints( + SourceLocation PointOfInstantiation, FunctionDecl *Decl, + ArrayRef TemplateArgs, + ConstraintSatisfaction &Satisfaction) { + // In most cases we're not going to have constraints, so check for that first. + FunctionTemplateDecl *Template = Decl->getPrimaryTemplate(); + // Note - code synthesis context for the constraints check is created + // inside CheckConstraintsSatisfaction. 
+ SmallVector TemplateAC; + Template->getAssociatedConstraints(TemplateAC); + if (TemplateAC.empty()) { + Satisfaction.IsSatisfied = true; + return false; + } + + // Enter the scope of this instantiation. We don't use + // PushDeclContext because we don't have a scope. + Sema::ContextRAII savedContext(*this, Decl); + LocalInstantiationScope Scope(*this); + + MultiLevelTemplateArgumentList MLTAL = + getTemplateInstantiationArgs(Decl, nullptr, /*RelativeToPrimary*/true); + + // If this is not an explicit specialization - we need to get the instantiated + // version of the template arguments and add them to scope for the + // substitution. + if (Decl->isTemplateInstantiation()) { + InstantiatingTemplate Inst(*this, Decl->getPointOfInstantiation(), + InstantiatingTemplate::ConstraintsCheck{}, Decl->getPrimaryTemplate(), + MLTAL.getInnermost(), SourceRange()); + if (Inst.isInvalid()) + return true; + if (addInstantiatedParametersToScope(*this, Decl, + Decl->getTemplateInstantiationPattern(), + Scope, MLTAL)) + return true; + } + + return CheckConstraintSatisfaction(Template, TemplateAC, TemplateArgs, + PointOfInstantiation, Satisfaction); +} + /// Initializes the common fields of an instantiation function /// declaration (New) from the corresponding fields of its template (Tmpl). /// diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 975d6620c06f8..d947d6d282be0 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -937,6 +937,10 @@ bool Sema::containsUnexpandedParameterPacks(Declarator &D) { } } + if (Expr *TRC = D.getTrailingRequiresClause()) + if (TRC->containsUnexpandedParameterPack()) + return true; + return false; } diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index e99af17a400ae..6107892b3769c 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -11559,6 +11559,13 @@ TreeTransform::TransformLambdaExpr(LambdaExpr *E) { NewCallOpType); } + // Transform the trailing requires clause + ExprResult NewTrailingRequiresClause; + if (Expr *TRC = E->getCallOperator()->getTrailingRequiresClause()) + // FIXME: Concepts: Substitution into requires clause should only happen + // when checking satisfaction. + NewTrailingRequiresClause = getDerived().TransformExpr(TRC); + // Create the local class that will describe the lambda. 
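// E.g. (illustrative): when a lambda such as
//   [](auto x) requires (sizeof(decltype(x)) == 1) { return x; }
// is rebuilt during template instantiation, the transformed clause computed
// above is handed to the new call operator created below.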
CXXRecordDecl *OldClass = E->getLambdaClass(); CXXRecordDecl *Class @@ -11579,7 +11586,8 @@ TreeTransform::TransformLambdaExpr(LambdaExpr *E) { Class, E->getIntroducerRange(), NewCallOpTSI, E->getCallOperator()->getEndLoc(), NewCallOpTSI->getTypeLoc().castAs().getParams(), - E->getCallOperator()->getConstexprKind()); + E->getCallOperator()->getConstexprKind(), + NewTrailingRequiresClause.get()); LSI->CallOperator = NewCallOperator; diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 3351f76151e35..a132164d30e75 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -822,6 +822,7 @@ void ASTDeclReader::VisitDeclaratorDecl(DeclaratorDecl *DD) { if (Record.readInt()) { // hasExtInfo auto *Info = new (Reader.getContext()) DeclaratorDecl::ExtInfo(); Record.readQualifierInfo(*Info); + Info->TrailingRequiresClause = Record.readExpr(); DD->DeclInfo = Info; } QualType TSIType = Record.readType(); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 66f4db855a3e9..a553936570b59 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -523,8 +523,11 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) { VisitValueDecl(D); Record.AddSourceLocation(D->getInnerLocStart()); Record.push_back(D->hasExtInfo()); - if (D->hasExtInfo()) - Record.AddQualifierInfo(*D->getExtInfo()); + if (D->hasExtInfo()) { + DeclaratorDecl::ExtInfo *Info = D->getExtInfo(); + Record.AddQualifierInfo(*Info); + Record.AddStmt(Info->TrailingRequiresClause); + } // The location information is deferred until the end of the record. Record.AddTypeRef(D->getTypeSourceInfo() ? D->getTypeSourceInfo()->getType() : QualType()); diff --git a/clang/test/Analysis/bstring.c b/clang/test/Analysis/bstring.c index 2d53402a9ad36..214f6537e10ed 100644 --- a/clang/test/Analysis/bstring.c +++ b/clang/test/Analysis/bstring.c @@ -222,6 +222,9 @@ void mempcpy2 () { char dst[1]; mempcpy(dst, src, 4); // expected-warning{{Memory copy function overflows destination buffer}} +#ifndef VARIANT +// expected-warning@-2{{'mempcpy' will always overflow; destination buffer has size 1, but size argument is 4}} +#endif } void mempcpy3 () { @@ -243,6 +246,9 @@ void mempcpy5() { char dst[3]; mempcpy(dst+2, src+2, 2); // expected-warning{{Memory copy function overflows destination buffer}} +#ifndef VARIANT +// expected-warning@-2{{'mempcpy' will always overflow; destination buffer has size 1, but size argument is 2}} +#endif } void mempcpy6() { diff --git a/clang/test/CXX/class.derived/class.virtual/p6.cpp b/clang/test/CXX/class.derived/class.virtual/p6.cpp new file mode 100644 index 0000000000000..63a4313de5541 --- /dev/null +++ b/clang/test/CXX/class.derived/class.virtual/p6.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +template +class A { + virtual void f1() requires (sizeof(T) == 0); + // expected-error@-1{{virtual function cannot have a requires clause}} + virtual void f2() requires (sizeof(T) == 1); + // expected-error@-1{{virtual function cannot have a requires clause}} +}; + +template +class B : A { + virtual void f1() requires (sizeof(T) == 0) override {} + // expected-error@-1{{virtual function cannot have a requires clause}} +}; + +template struct C : T {void f() requires true; }; +// expected-error@-1{{virtual function cannot have a requires clause}} +struct D { virtual void f(); }; +template struct C; +// 
expected-note@-1{{in instantiation of template class 'C' requested here}} \ No newline at end of file diff --git a/clang/test/CXX/dcl/dcl.decl/p3.cpp b/clang/test/CXX/dcl/dcl.decl/p3.cpp new file mode 100644 index 0000000000000..eec0aa2043a2e --- /dev/null +++ b/clang/test/CXX/dcl/dcl.decl/p3.cpp @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +template +constexpr bool is_same_v = false; + +template +constexpr bool is_same_v = true; + +void f1(int a) requires true; // OK +auto f2(int a) -> bool requires true; // OK +auto f3(int a) -> bool (*)(int b) requires true; // OK +auto f4(int a) requires true -> bool; // expected-error{{trailing return type must appear before trailing requires clause}} +int f5(int a) requires; // expected-error{{expected expression}} +int f6(int a) requires {} // expected-error{{expected expression}} +void (f7()) requires true; +void (f8() requires true); // expected-error{{trailing requires clause should be placed outside parentheses}} +void (*(f9 requires (true)))(); // expected-error{{trailing requires clause should be placed outside parentheses}} +static_assert(is_same_v); +void (*pf)() requires true; // expected-error{{trailing requires clause can only be used when declaring a function}} +void g1(int (*dsdads)() requires false); // expected-error{{trailing requires clause can only be used when declaring a function}} +void g2(int (*(*dsdads)())() requires true); // expected-error{{trailing requires clause can only be used when declaring a function}} +void g3(int (*(*dsdads)(int) requires true)() ); // expected-error{{trailing requires clause should be placed outside parentheses}} +using T = void (); +T x requires true; +struct S { + T m1 requires true, m2 requires true; +}; + +template +struct R { + R(T t); +}; + +template +R(T) -> R requires true; // expected-error{{deduction guide cannot have a requires clause}} diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.id/mixed-constraints.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.id/mixed-constraints.cpp new file mode 100644 index 0000000000000..fafb3f7b35d9f --- /dev/null +++ b/clang/test/CXX/expr/expr.prim/expr.prim.id/mixed-constraints.cpp @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +template requires (sizeof(T) >= 4 && sizeof(T) <= 10) +// expected-note@-1{{because 'sizeof(char [20]) <= 10' (20 <= 10) evaluated to false}} +// expected-note@-2{{because 'sizeof(char) >= 4' (1 >= 4) evaluated to false}} +void foo() requires (sizeof(T) <= 8) {} +// expected-note@-1{{candidate template ignored: constraints not satisfied [with T = char]}} +// expected-note@-2{{candidate template ignored: constraints not satisfied [with T = char [9]]}} +// expected-note@-3{{candidate template ignored: constraints not satisfied [with T = char [20]]}} +// expected-note@-4{{because 'sizeof(char [9]) <= 8' (9 <= 8) evaluated to false}} + +void bar() { + foo(); // expected-error{{no matching function for call to 'foo'}} + foo(); + foo(); + foo(); // expected-error{{no matching function for call to 'foo'}} + foo(); // expected-error{{no matching function for call to 'foo'}} +} \ No newline at end of file diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.id/p4.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.id/p4.cpp new file mode 100644 index 0000000000000..f13ab279da33a --- /dev/null +++ b/clang/test/CXX/expr/expr.prim/expr.prim.id/p4.cpp @@ -0,0 +1,58 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +namespace functions +{ + void foo(int) requires false {} + // 
expected-note@-1 3{{because 'false' evaluated to false}} + // expected-note@-2 {{candidate function not viable: constraints not satisfied}} + void bar(int) requires true {} + + void a(int); + void a(double); + + void baz() { + foo(1); // expected-error{{no matching function for call to 'foo'}} + bar(1); + void (*p1)(int) = foo; // expected-error{{invalid reference to function 'foo': constraints not satisfied}} + void (*p3)(int) = bar; + decltype(foo)* a1 = nullptr; // expected-error{{invalid reference to function 'foo': constraints not satisfied}} + decltype(bar)* a2 = nullptr; + } +} + +namespace methods +{ + template + struct A { + static void foo(int) requires (sizeof(T) == 1) {} // expected-note 3{{because 'sizeof(char [2]) == 1' (2 == 1) evaluated to false}} + static void bar(int) requires (sizeof(T) == 2) {} // expected-note 3{{because 'sizeof(char) == 2' (1 == 2) evaluated to false}} + }; + + void baz() { + A::foo(1); + A::bar(1); // expected-error{{invalid reference to function 'bar': constraints not satisfied}} + A::foo(1); // expected-error{{invalid reference to function 'foo': constraints not satisfied}} + A::bar(1); + void (*p1)(int) = A::foo; + void (*p2)(int) = A::bar; // expected-error{{invalid reference to function 'bar': constraints not satisfied}} + void (*p3)(int) = A::foo; // expected-error{{invalid reference to function 'foo': constraints not satisfied}} + void (*p4)(int) = A::bar; + decltype(A::foo)* a1 = nullptr; + decltype(A::bar)* a2 = nullptr; // expected-error{{invalid reference to function 'bar': constraints not satisfied}} + decltype(A::foo)* a3 = nullptr; // expected-error{{invalid reference to function 'foo': constraints not satisfied}} + decltype(A::bar)* a4 = nullptr; + } +} + +namespace operators +{ + template + struct A { + A operator-(A b) requires (sizeof(T) == 1) { return b; } // expected-note{{because 'sizeof(int) == 1' (4 == 1) evaluated to false}} + }; + + void baz() { + auto* x = &A::operator-; // expected-error{{invalid reference to function 'operator-': constraints not satisfied}} + auto y = &A::operator-; + } +} \ No newline at end of file diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.lambda/expr.prim.lambda.closure/p3.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.lambda/expr.prim.lambda.closure/p3.cpp new file mode 100644 index 0000000000000..942280e1059fb --- /dev/null +++ b/clang/test/CXX/expr/expr.prim/expr.prim.lambda/expr.prim.lambda.closure/p3.cpp @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +auto l1 = [] (auto x) requires (sizeof(decltype(x)) == 1) { return x; }; +// expected-note@-1{{candidate template ignored: constraints not satisfied [with $0 = int]}} +// expected-note@-2{{because 'sizeof(decltype(x)) == 1' (4 == 1) evaluated to false}} + +auto l1t1 = l1('a'); +auto l1t2 = l1(1); +// expected-error@-1{{no matching function for call to object of type '(lambda at}} + +auto l2 = [] (auto... x) requires ((sizeof(decltype(x)) >= 2) && ...) 
{ return (x + ...); }; +// expected-note@-1{{candidate template ignored: constraints not satisfied [with $0 = ]}} +// expected-note@-2{{candidate template ignored: constraints not satisfied [with $0 = ]}} +// expected-note@-3 2{{because 'sizeof(decltype(x)) >= 2' (1 >= 2) evaluated to false}} + +auto l2t1 = l2('a'); +// expected-error@-1{{no matching function for call to object of type '(lambda at}} +auto l2t2 = l2(1, 'a'); +// expected-error@-1{{no matching function for call to object of type '(lambda at}} +auto l2t3 = l2((short)1, (short)1); \ No newline at end of file diff --git a/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp b/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp new file mode 100644 index 0000000000000..36c68071448c7 --- /dev/null +++ b/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp @@ -0,0 +1,113 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +template +constexpr static bool is_same_v = false; + +template +constexpr static bool is_same_v = true; + +namespace templates +{ + template + concept AtLeast1 = sizeof(T) >= 1; + + template + int foo(T t) requires (sizeof(T) == 4) { // expected-note {{candidate function}} + return 0; + } + + template + char foo(T t) requires AtLeast1 { // expected-note {{candidate function}} + return 'a'; + } + + template + double foo(T t) requires (AtLeast1 && sizeof(T) <= 2) { + return 'a'; + } + + static_assert(is_same_v); // expected-error {{call to 'foo' is ambiguous}} + static_assert(is_same_v); + + template + void bar() requires (sizeof(T) == 1) { } + // expected-note@-1{{similar constraint expressions not considered equivalent}} + // expected-note@-2{{candidate function [with T = char]}} + + template + void bar() requires (sizeof(T) == 1 && sizeof(T) >= 0) { } + // expected-note@-1{{candidate function [with T = char]}} + // expected-note@-2{{similar constraint expression here}} + + static_assert(is_same_v()), void>); + // expected-error@-1{{call to 'bar' is ambiguous}} + + template + constexpr int baz() requires AtLeast1 { // expected-note {{candidate function}} + return 1; + } + + template requires AtLeast1 + constexpr int baz() { // expected-note {{candidate function [with T = int]}} + return 2; + } + + static_assert(baz() == 1); // expected-error {{call to 'baz' is ambiguous}} +} + +namespace non_template +{ + template + concept AtLeast2 = sizeof(T) >= 2; + + template + concept AtMost8 = sizeof(T) <= 8; + + int foo() requires AtLeast2 && AtMost8 { + return 0; + } + + double foo() requires AtLeast2 { + return 0.0; + } + + double baz() requires AtLeast2 && AtMost8 { // expected-note {{candidate function}} + return 0.0; + } + + int baz() requires AtMost8 && AtLeast2 { // expected-note {{candidate function}} + return 0.0; + } + + void bar() requires (sizeof(char[8]) >= 8) { } + // expected-note@-1 {{candidate function}} + // expected-note@-2 {{similar constraint expressions not considered equivalent}} + + void bar() requires (sizeof(char[8]) >= 8 && sizeof(int) <= 30) { } + // expected-note@-1 {{candidate function}} + // expected-note@-2 {{similar constraint expression here}} + + static_assert(is_same_v); + static_assert(is_same_v); // expected-error {{call to 'baz' is ambiguous}} + static_assert(is_same_v); // expected-error {{call to 'bar' is ambiguous}} + + constexpr int goo(int a) requires AtLeast2 && true { + return 1; + } + + constexpr int goo(const int b) requires AtLeast2 { + return 2; + } + + // Only trailing requires clauses of redeclarations are compared for overload resolution. 
+ constexpr int doo(int a, ...) requires AtLeast2 && true { // expected-note {{candidate function}} + return 1; + } + + constexpr int doo(int b) requires AtLeast2 { // expected-note {{candidate function}} + return 2; + } + + static_assert(goo(1) == 1); + static_assert(doo(2) == 1); // expected-error {{call to 'doo' is ambiguous}} +} diff --git a/clang/test/CXX/over/over.match/over.match.viable/p3.cpp b/clang/test/CXX/over/over.match/over.match.viable/p3.cpp new file mode 100644 index 0000000000000..ef752d76ec23d --- /dev/null +++ b/clang/test/CXX/over/over.match/over.match.viable/p3.cpp @@ -0,0 +1,63 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +struct S2 {}; +// expected-note@-1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'S1' to 'const S2' for 1st argument}} +// expected-note@-2 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'S1' to 'S2' for 1st argument}} +// expected-note@-3 {{candidate constructor (the implicit default constructor) not viable: requires 0 arguments, but 1 was provided}} + +struct S1 { + void foo() const requires true {} + void foo() const requires false {} + void bar() const requires false {} + // expected-note@-1 {{because 'false' evaluated to false}} + operator bool() const requires true { return true; } + explicit operator bool() const requires false; + explicit operator S2() const requires false; + // expected-note@-1 {{candidate function not viable: constraints not satisfied}} + // expected-note@-2 {{because 'false' evaluated to false}} +}; + +void foo() { + S1().foo(); + S1().bar(); + // expected-error@-1 {{invalid reference to function 'bar': constraints not satisfied}} + (void) static_cast(S1()); + (void) static_cast(S1()); + // expected-error@-1 {{no matching conversion for static_cast from 'S1' to 'S2'}} +} + +// Test that constraints are checked before implicit conversions are formed. + +template +struct invalid_template { using X = typename T::non_existant; }; +struct A { + template::aadasas> + operator T() {} +}; + +void foo(int) requires false; +void foo(A) requires true; + +struct S { + void foo(int) requires false; + void foo(A) requires true; + S(A) requires false; + S(double) requires true; + ~S() requires false; + // expected-note@-1 2{{because 'false' evaluated to false}} + ~S() requires true; + operator int() requires true; + operator int() requires false; +}; + +void bar() { + foo(A{}); + S{1.}.foo(A{}); + // expected-error@-1{{invalid reference to function '~S': constraints not satisfied}} + // Note - this behavior w.r.t. constrained dtors is a consequence of current + // wording, which does not invoke overload resolution when a dtor is called. + // P0848 is set to address this issue. 
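Reviewer aside, not part of the test: because overload resolution is not performed for destructor calls, the constraint on '~S' is checked only when a destructor invocation is actually formed; merely naming the type is fine. A hypothetical snippet that would still compile against the declarations above:

void no_destruction() {
  S *p = nullptr; // no S object is created or destroyed here, so the
  (void)p;        // unsatisfied constraint on '~S' is never checked
}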
+ S s = 1; + // expected-error@-1{{invalid reference to function '~S': constraints not satisfied}} + int a = s; +} \ No newline at end of file diff --git a/clang/test/CXX/over/over.over/p4-2a.cpp b/clang/test/CXX/over/over.over/p4-2a.cpp new file mode 100644 index 0000000000000..a5d7a110992cf --- /dev/null +++ b/clang/test/CXX/over/over.over/p4-2a.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +template +constexpr static bool is_same_v = false; + +template +constexpr static bool is_same_v = true; + +template +concept AtLeast2 = sizeof(T) >= 2; + +template +concept AtMost8 = sizeof(T) <= 8; + +int foo() requires AtLeast2 && AtMost8 { + return 0; +} + +double foo() requires AtLeast2 { + return 0.0; +} + +char bar() requires AtLeast2 { // expected-note {{possible target for call}} + return 1.0; +} + +short bar() requires AtLeast2 && AtMost8 { +// expected-note@-1{{possible target for call}} +// expected-note@-2{{candidate function}} + return 0.0; +} + +int bar() requires AtMost8 && AtLeast2 { +// expected-note@-1{{possible target for call}} +// expected-note@-2{{candidate function}} + return 0.0; +} + +char baz() requires AtLeast2 { + return 1.0; +} + +short baz() requires AtLeast2 && AtMost8 { + return 0.0; +} + +int baz() requires AtMost8 && AtLeast2 { + return 0.0; +} + +long baz() requires AtMost8 && AtLeast2 && AtLeast2 { + return 3.0; +} + +void a() { + static_assert(is_same_v); + static_assert(is_same_v); + // expected-error@-1{{reference to overloaded function could not be resolved; did you mean to call it with no arguments?}} + // expected-error@-2{{call to 'bar' is ambiguous}} + static_assert(is_same_v); +} \ No newline at end of file diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.constr/function-templates.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.constr/function-templates.cpp index c1a3a27fbeacc..99de7261a81c8 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.constr/function-templates.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.constr/function-templates.cpp @@ -23,14 +23,13 @@ static_assert(is_same_v(nullptr)), int>); static_assert(is_same_v); // expected-error {{no matching function for call to 'dereference'}} static_assert(is_same_v('a')), char>); // expected-error {{no matching function for call to 'dereference'}} - -template requires T{} + T{} // expected-note {{because substituted constraint expression is ill-formed: invalid operands to binary expression ('A' and 'A')}} +template requires (T{} + T{}) // expected-note {{because substituted constraint expression is ill-formed: invalid operands to binary expression ('A' and 'A')}} auto foo(T t) { // expected-note {{candidate template ignored: constraints not satisfied [with T = A]}} return t + t; } -template requires !((T{} - T{}) && (T{} + T{})) || false +template requires (!((T{} - T{}) && (T{} + T{})) || false) // expected-note@-1{{because substituted constraint expression is ill-formed: invalid operands to binary expression ('A' and 'A')}} // expected-note@-2{{and 'false' evaluated to false}} auto bar(T t) { // expected-note {{candidate template ignored: constraints not satisfied [with T = A]}} diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.constr/non-function-templates.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.constr/non-function-templates.cpp index 24caa5063a1b4..a25b22a9a1544 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.constr/non-function-templates.cpp +++ 
b/clang/test/CXX/temp/temp.constr/temp.constr.constr/non-function-templates.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ -verify %s -template requires sizeof(T) >= 2 // expected-note{{because 'sizeof(char) >= 2' (1 >= 2) evaluated to false}} +template requires (sizeof(T) >= 2) // expected-note{{because 'sizeof(char) >= 2' (1 >= 2) evaluated to false}} struct A { static constexpr int value = sizeof(T); }; @@ -9,8 +9,8 @@ static_assert(A::value == 4); static_assert(A::value == 1); // expected-error{{constraints not satisfied for class template 'A' [with T = char]}} template - requires sizeof(T) != sizeof(U) // expected-note{{because 'sizeof(int) != sizeof(char [4])' (4 != 4) evaluated to false}} - && sizeof(T) >= 4 // expected-note{{because 'sizeof(char) >= 4' (1 >= 4) evaluated to false}} + requires (sizeof(T) != sizeof(U) // expected-note{{because 'sizeof(int) != sizeof(char [4])' (4 != 4) evaluated to false}} + && sizeof(T) >= 4) // expected-note{{because 'sizeof(char) >= 4' (1 >= 4) evaluated to false}} constexpr int SizeDiff = sizeof(T) > sizeof(U) ? sizeof(T) - sizeof(U) : sizeof(U) - sizeof(T); static_assert(SizeDiff == 3); @@ -44,16 +44,16 @@ static_assert(S::value); template struct AA { - template requires sizeof(U) == sizeof(T) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} + template requires (sizeof(U) == sizeof(T)) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} struct B { static constexpr int a = 0; }; - template requires sizeof(U) == sizeof(T) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} + template requires (sizeof(U) == sizeof(T)) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} static constexpr int b = 1; - template requires sizeof(U) == sizeof(T) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} + template requires (sizeof(U) == sizeof(T)) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} static constexpr int getB() { // expected-note{{candidate template ignored: constraints not satisfied [with U = int [2]]}} return 2; } @@ -85,8 +85,8 @@ template requires B::type // expected-note{{in instantiation of t // expected-note@-1{{while substituting template arguments into constraint expression here}} struct C { }; -template requires T{} // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} +template requires (T{}) // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} struct D { }; static_assert(C{}); // expected-note{{while checking constraint satisfaction for template 'C' required here}} -static_assert(D{}); // expected-note{{while checking constraint satisfaction for template 'D' required here}} \ No newline at end of file +static_assert(D{}); // expected-note{{while checking constraint satisfaction for template 'D' required here}} diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.constr/partial-specializations.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.constr/partial-specializations.cpp index 47bd2a5507690..1ea4da29ee9f5 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.constr/partial-specializations.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.constr/partial-specializations.cpp @@ -2,10 +2,10 @@ namespace class_templates { - template requires sizeof(T) >= 4 // expected-note {{because 'sizeof(char) >= 4' (1 >= 4) evaluated to false}} + 
template requires (sizeof(T) >= 4) // expected-note {{because 'sizeof(char) >= 4' (1 >= 4) evaluated to false}} struct is_same { static constexpr bool value = false; }; - template requires sizeof(T*) >= 4 && sizeof(T) >= 4 + template requires (sizeof(T*) >= 4 && sizeof(T) >= 4) struct is_same { static constexpr bool value = true; }; static_assert(!is_same::value); @@ -23,7 +23,7 @@ namespace class_templates // expected-note@-1{{while substituting template arguments into constraint expression here}} struct B {}; - template requires T{} // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} + template requires (T{}) // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} struct B {}; static_assert((B{}, true)); // expected-note{{while checking constraint satisfaction for class template partial specialization 'B' required here}} @@ -35,10 +35,10 @@ namespace class_templates namespace variable_templates { - template requires sizeof(T) >= 4 + template requires (sizeof(T) >= 4) constexpr bool is_same_v = false; - template requires sizeof(T*) >= 4 && sizeof(T) >= 4 + template requires (sizeof(T*) >= 4 && sizeof(T) >= 4) constexpr bool is_same_v = true; static_assert(!is_same_v); @@ -55,7 +55,7 @@ namespace variable_templates // expected-note@-1{{while substituting template arguments into constraint expression here}} constexpr bool v1 = true; - template requires T{} // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} + template requires (T{}) // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} constexpr bool v1 = true; static_assert(v1); // expected-note{{while checking constraint satisfaction for variable template partial specialization 'v1' required here}} diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.decl/class-template-decl.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.decl/class-template-decl.cpp index 5d5361f9c20c3..6f7b80e26a66a 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.decl/class-template-decl.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.decl/class-template-decl.cpp @@ -2,9 +2,9 @@ namespace nodiag { -template requires bool(T()) +template requires (bool(T())) struct A; -template requires bool(U()) +template requires (bool(U())) struct A; } // end namespace nodiag @@ -21,7 +21,7 @@ struct B; template requires true // expected-note{{previous template declaration is here}} struct C; -template requires !0 // expected-error{{requires clause differs in template redeclaration}} +template requires (!0) // expected-error{{requires clause differs in template redeclaration}} struct C; } // end namespace diag @@ -29,15 +29,15 @@ struct C; namespace nodiag { struct AA { - template requires someFunc(T()) + template requires (someFunc(T())) struct A; }; -template requires someFunc(U()) +template requires (someFunc(U())) struct AA::A { }; struct AAF { - template requires someFunc(T()) + template requires (someFunc(T())) friend struct AA::A; }; diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.decl/func-template-decl.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.decl/func-template-decl.cpp index c83ab26059d7c..30fbec64eea78 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.decl/func-template-decl.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.decl/func-template-decl.cpp @@ -2,9 +2,9 @@ namespace nodiag { -template requires bool(T()) +template requires (bool(T())) int A(); -template requires bool(U()) +template requires (bool(U())) int A(); } // end 
namespace nodiag @@ -26,7 +26,7 @@ int orig::A(); template requires true int orig::B(); // expected-error@-1{{out-of-line declaration of 'B' does not match any declaration in namespace 'diag::orig'}} -template requires !0 +template requires (!0) int orig::C(); // expected-error@-1{{out-of-line declaration of 'C' does not match any declaration in namespace 'diag::orig'}} @@ -35,11 +35,11 @@ int orig::C(); namespace nodiag { struct AA { - template requires someFunc(T()) + template requires (someFunc(T())) int A(); }; -template requires someFunc(T()) +template requires (someFunc(T())) int AA::A() { return sizeof(T); } } // end namespace nodiag diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.decl/var-template-decl.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.decl/var-template-decl.cpp index cf6874f12d3f5..eabb636b0bbbf 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.decl/var-template-decl.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.decl/var-template-decl.cpp @@ -3,11 +3,11 @@ namespace nodiag { struct B { - template requires bool(T()) + template requires (bool(T())) static int A; }; -template requires bool(U()) +template requires (bool(U())) int B::A = int(U()); } // end namespace nodiag @@ -15,11 +15,11 @@ int B::A = int(U()); namespace diag { struct B { - template requires bool(T()) // expected-note{{previous template declaration is here}} + template requires (bool(T())) // expected-note{{previous template declaration is here}} static int A; }; -template requires !bool(U()) // expected-error{{requires clause differs in template redeclaration}} +template requires (!bool(U())) // expected-error{{requires clause differs in template redeclaration}} int B::A = int(U()); } // end namespace diag \ No newline at end of file diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.order/class-template-partial-specializations.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.order/class-template-partial-specializations.cpp index 8c2f552694173..5d41035aa88d7 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.order/class-template-partial-specializations.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.order/class-template-partial-specializations.cpp @@ -1,9 +1,12 @@ // RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ -verify %s -template requires sizeof(T) >= 4 +template requires (sizeof(T) >= 4) +// expected-note@-1{{similar constraint expressions not considered equivalent}} class A{}; // expected-note{{template is declared here}} -template requires sizeof(T) >= 4 && sizeof(T) <= 10 +template requires (sizeof(T) >= 4 && sizeof(T) <= 10) +// expected-note@-1{{similar constraint expression here}} + class A{}; // expected-error{{class template partial specialization is not more specialized than the primary template}} template @@ -12,7 +15,7 @@ concept C1 = sizeof(T) >= 4; template requires C1 class B{}; -template requires C1 && sizeof(T) <= 10 +template requires (C1 && sizeof(T) <= 10) class B{}; template @@ -48,3 +51,15 @@ struct F{ enum{ value = 3 }; }; static_assert(F::value == 2); static_assert(F::value == 3); static_assert(F::value == 1); + +// Make sure atomic constraints subsume each other only if their parameter +// mappings are identical.
+ +template requires C2 +struct I { }; // expected-note {{template is declared here}} + +template requires C2 +struct I { }; // expected-error {{class template partial specialization is not more specialized than the primary template}} + +template requires C2 && C2 +struct I { }; diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp index cc578fe0ad62c..7f68369d52842 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp @@ -1,9 +1,11 @@ // RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ -verify %s -template requires sizeof(T) >= 4 +template requires (sizeof(T) >= 4) +// expected-note@-1{{similar constraint expressions not considered equivalent}} bool a() { return false; } // expected-note {{candidate function [with T = unsigned int]}} -template requires sizeof(T) >= 4 && sizeof(T) <= 10 +template requires (sizeof(T) >= 4 && sizeof(T) <= 10) +// expected-note@-1{{similar constraint expression here}} bool a() { return true; } // expected-note {{candidate function [with T = unsigned int]}} bool av = a(); // expected-error {{call to 'a' is ambiguous}} @@ -14,7 +16,7 @@ concept C1 = sizeof(T) >= 4; template requires C1 constexpr bool b() { return false; } -template requires C1 && sizeof(T) <= 10 +template requires (C1 && sizeof(T) <= 10) constexpr bool b() { return true; } static_assert(b()); @@ -86,4 +88,4 @@ static_assert(sizeof(g())); template struct X {}; template int h(X<0>); template int h(X); -static_assert(sizeof(h(X<0>{}))); \ No newline at end of file +static_assert(sizeof(h(X<0>{}))); diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.order/var-template-partial-specializations.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.order/var-template-partial-specializations.cpp index b40c77e70a194..cf88e34036dc7 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.order/var-template-partial-specializations.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.order/var-template-partial-specializations.cpp @@ -1,9 +1,11 @@ // RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ -verify %s -template requires sizeof(T) >= 4 +template requires (sizeof(T) >= 4) +// expected-note@-1{{similar constraint expressions not considered equivalent}} bool a = false; // expected-note{{template is declared here}} -template requires sizeof(T) >= 4 && sizeof(T) <= 10 +template requires (sizeof(T) >= 4 && sizeof(T) <= 10) +// expected-note@-1{{similar constraint expression here}} bool a = true; // expected-error{{variable template partial specialization is not more specialized than the primary template}} template @@ -12,7 +14,7 @@ concept C1 = sizeof(T) >= 4; template requires C1 bool b = false; -template requires C1 && sizeof(T) <= 10 +template requires (C1 && sizeof(T) <= 10) bool b = true; template diff --git a/clang/test/CXX/temp/temp.explicit/p8.cpp b/clang/test/CXX/temp/temp.explicit/p8.cpp new file mode 100644 index 0000000000000..72d2255789960 --- /dev/null +++ b/clang/test/CXX/temp/temp.explicit/p8.cpp @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ -verify %s + +template requires (sizeof(T) + sizeof(S) < 10) +// expected-note@-1{{because 'sizeof(char [100]) + sizeof(char) < 10' (101 < 10) evaluated to false}} +void f(T t, S s) requires (sizeof(t) == 1 && sizeof(s) == 1) { }; +// expected-note@-1{{candidate template ignored: constraints not satisfied [with T = 
int, S = char]}} +// expected-note@-2{{because 'sizeof (t) == 1' (4 == 1) evaluated to false}} +// expected-note@-3{{candidate template ignored: constraints not satisfied [with T = char, S = short]}} +// expected-note@-4{{because 'sizeof (s) == 1' (2 == 1) evaluated to false}} +// expected-note@-5{{candidate template ignored: constraints not satisfied [with T = char [100], S = char]}} + +template<> +void f(int t, char s) { }; +// expected-error@-1{{no function template matches function template specialization 'f'}} + +template<> +void f(char t, short s) { }; +// expected-error@-1{{no function template matches function template specialization 'f'}} + +template<> +void f(char t[100], char s) { }; +// expected-error@-1{{no function template matches function template specialization 'f'}} \ No newline at end of file diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c index 984d5989217e1..a0f37fe65d3de 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c @@ -98,3 +98,45 @@ void test_vst2q_f16(float16_t *addr, float16x8x2_t value) vst2q_f16(addr, value); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @load_into_variable( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.mve.vld2q.v8i16.p0i16(i16* [[ADDR:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] undef, <8 x i16> [[TMP1]], 0, 0 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[TMP2]], <8 x i16> [[TMP3]], 0, 1 +// CHECK-NEXT: store <8 x i16> [[TMP1]], <8 x i16>* [[VALUES:%.*]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[VALUES]], i32 1 +// CHECK-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* [[ARRAYIDX4]], align 8 +// CHECK-NEXT: ret void +// +void load_into_variable(const uint16_t *addr, uint16x8_t *values) +{ + uint16x8x2_t v; +#ifdef POLYMORPHIC + v = vld2q(addr); +#else /* POLYMORPHIC */ + v = vld2q_u16(addr); +#endif /* POLYMORPHIC */ + values[0] = v.val[0]; + values[1] = v.val[1]; +} + +// CHECK-LABEL: @extract_one_vector( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[ADDR:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] undef, <4 x i32> [[TMP1]], 0, 0 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[TMP2]], <4 x i32> [[TMP3]], 0, 1 +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t extract_one_vector(const int32_t *addr) +{ +#ifdef POLYMORPHIC + return vld2q(addr).val[0]; +#else /* POLYMORPHIC */ + return vld2q_s32(addr).val[0]; +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-target-features.c b/clang/test/CodeGen/arm-target-features.c index 03719f8a9e5dc..11fe4e505439f 100644 --- a/clang/test/CodeGen/arm-target-features.c +++ b/clang/test/CodeGen/arm-target-features.c @@ -1,23 +1,23 @@ // REQUIRES: arm-registered-target // RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu cortex-a8 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3 -// CHECK-VFP3: 
"target-features"="+armv7-a,+d32,+dsp,+fp64,+fpregs,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp" +// CHECK-VFP3: "target-features"="+armv7-a,+d32,+dsp,+fp64,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu cortex-a5 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4 -// CHECK-VFP4: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+fpregs,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" +// CHECK-VFP4: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu cortex-a7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-a12 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV // RUN: %clang_cc1 -triple thumbv7s-linux-gnueabi -target-cpu swift -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV-2 // RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu krait -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV -// CHECK-VFP4-DIV: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" -// CHECK-VFP4-DIV-2: "target-features"="+armv7s,+d32,+dsp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" +// CHECK-VFP4-DIV: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" +// CHECK-VFP4-DIV-2: "target-features"="+armv7s,+d32,+dsp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" // RUN: %clang_cc1 -triple armv7-linux-gnueabihf -target-cpu cortex-a15 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV-ARM // RUN: %clang_cc1 -triple armv7-linux-gnueabihf -target-cpu cortex-a17 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV-ARM -// CHECK-VFP4-DIV-ARM: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+neon,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp,-thumb-mode" +// CHECK-VFP4-DIV-ARM: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+neon,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp,-thumb-mode" // RUN: %clang_cc1 -triple thumbv7s-apple-ios7.0 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 @@ -26,34 +26,34 @@ // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a72 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a73 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 -// CHECK-BASIC-V8: 
"target-features"="+armv8-a,+crc,+crypto,+d32,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" +// CHECK-BASIC-V8: "target-features"="+armv8-a,+crc,+crypto,+d32,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V82 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m5 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V82 -// CHECK-BASIC-V82: "target-features"="+armv8.2-a,+crc,+crypto,+d32,+dotprod,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+fpregs,+fullfp16,+hwdiv,+hwdiv-arm,+neon,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" +// CHECK-BASIC-V82: "target-features"="+armv8.2-a,+crc,+crypto,+d32,+dotprod,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+fullfp16,+hwdiv,+hwdiv-arm,+neon,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" // RUN: %clang_cc1 -triple armv8-linux-gnueabi -target-cpu cortex-a53 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8-ARM -// CHECK-BASIC-V8-ARM: "target-features"="+armv8-a,+crc,+crypto,+d32,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+neon,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp,-thumb-mode" +// CHECK-BASIC-V8-ARM: "target-features"="+armv8-a,+crc,+crypto,+d32,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+neon,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp,-thumb-mode" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r5 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-DIV -// CHECK-VFP3-D16-DIV: "target-features"="+armv7-r,+dsp,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp" +// CHECK-VFP3-D16-DIV: "target-features"="+armv7-r,+dsp,+fp64,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp" // RUN: %clang_cc1 -triple armv7-linux-gnueabi -target-cpu cortex-r4f -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-THUMB-DIV -// CHECK-VFP3-D16-THUMB-DIV: "target-features"="+armv7-r,+dsp,+fp64,+fpregs,+hwdiv,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,-thumb-mode" +// CHECK-VFP3-D16-THUMB-DIV: "target-features"="+armv7-r,+dsp,+fp64,+hwdiv,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,-thumb-mode" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-FP16-DIV // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r8 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-FP16-DIV -// CHECK-VFP3-D16-FP16-DIV: "target-features"="+armv7-r,+dsp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp" +// CHECK-VFP3-D16-FP16-DIV: "target-features"="+armv7-r,+dsp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-m4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-D16-SP-THUMB-DIV -// CHECK-VFP4-D16-SP-THUMB-DIV: 
"target-features"="+armv7e-m,+dsp,+fp16,+fpregs,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" +// CHECK-VFP4-D16-SP-THUMB-DIV: "target-features"="+armv7e-m,+dsp,+fp16,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-m7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP5-D16-THUMB-DIV -// CHECK-VFP5-D16-THUMB-DIV: "target-features"="+armv7e-m,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fpregs,+hwdiv,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp" +// CHECK-VFP5-D16-THUMB-DIV: "target-features"="+armv7e-m,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+hwdiv,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp" // RUN: %clang_cc1 -triple armv7-linux-gnueabi -target-cpu cortex-r4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-THUMB-DIV @@ -105,6 +105,6 @@ // CHECK-ARMV8M-M23-LINUX: "target-features"="+armv8-m.base,+hwdiv,+thumb-mode" // RUN: %clang_cc1 -triple thumb-linux-gnueabi -target-cpu cortex-m33 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV8M-MAIN-LINUX -// CHECK-ARMV8M-MAIN-LINUX: "target-features"="+armv8-m.main,+dsp,+fp-armv8d16sp,+fp16,+fpregs,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" +// CHECK-ARMV8M-MAIN-LINUX: "target-features"="+armv8-m.main,+dsp,+fp-armv8d16sp,+fp16,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" void foo() {} diff --git a/clang/test/CodeGen/mempcpy-libcall.c b/clang/test/CodeGen/mempcpy-libcall.c new file mode 100644 index 0000000000000..b88f494f164df --- /dev/null +++ b/clang/test/CodeGen/mempcpy-libcall.c @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -emit-llvm < %s| FileCheck %s + +typedef __SIZE_TYPE__ size_t; + +void *mempcpy(void *, void const *, size_t); + +char *test(char *d, char *s, size_t n) { + // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}} %[[REG1:[^ ]+]], i8* {{.*}} %1, i64 %[[REG2:[^ ]+]], i1 false) + // CHECK-NEXT: %[[REGr:[^ ]+]] = getelementptr inbounds i8, i8* %[[REG1]], i64 %[[REG2]] + // CHECK-NEXT: ret i8* %[[REGr]] + return mempcpy(d, s, n); +} diff --git a/clang/test/Driver/arm-mfpu.c b/clang/test/Driver/arm-mfpu.c index b709622f72075..c3731fa5bd635 100644 --- a/clang/test/Driver/arm-mfpu.c +++ b/clang/test/Driver/arm-mfpu.c @@ -84,7 +84,7 @@ // CHECK-VFP3-D16-DAG: "-target-feature" "-vfp4d16sp" // CHECK-VFP3-D16-DAG: "-target-feature" "-fp-armv8d16sp" // CHECK-VFP3-D16-DAG: "-target-feature" "+fp64" -// CHECK-VFP3-D16-NOT: "-target-feature" "+d32" +// CHECK-VFP3-D16-DAG: "-target-feature" "-d32" // CHECK-VFP3-D16-DAG: "-target-feature" "-neon" // RUN: %clang -target arm-linux-eabi -mfpu=vfpv3-d16-fp16 %s -### -o %t.o 2>&1 \ @@ -98,7 +98,7 @@ // CHECK-VFP3-D16-FP16-DAG: "-target-feature" "-vfp4d16sp" // CHECK-VFP3-D16-FP16-DAG: "-target-feature" "-fp-armv8d16sp" // CHECK-VFP3-D16-FP16-DAG: "-target-feature" "+fp64" -// CHECK-VFP3-D16-FP16-NOT: "-target-feature" "+d32" +// CHECK-VFP3-D16-FP16-DAG: "-target-feature" "-d32" // CHECK-VFP3-D16-FP16-DAG: "-target-feature" "-neon" // CHECK-VFP3-D16-FP16-DAG: "-target-feature" "-crypto" @@ -108,8 +108,8 @@ // RUN: | FileCheck --check-prefix=CHECK-SOFT-ABI-FP-3 %s // CHECK-VFP3XD-NOT: "-target-feature" "+soft-float" // CHECK-VFP3XD-DAG: "-target-feature" "+soft-float-abi" -// CHECK-VFP3XD-NOT: "-target-feature" "+fp64" -// CHECK-VFP3XD-NOT: "-target-feature" "+d32" +// CHECK-VFP3XD-DAG: "-target-feature" "-fp64" +// CHECK-VFP3XD-DAG: "-target-feature" "-d32" // CHECK-VFP3XD-DAG: "-target-feature" "+vfp3d16sp" // CHECK-VFP3XD-DAG: "-target-feature" 
"-fp16" // CHECK-VFP3XD-DAG: "-target-feature" "-vfp4d16sp" @@ -127,8 +127,8 @@ // CHECK-VFP3XD-FP16-DAG: "-target-feature" "+fp16" // CHECK-VFP3XD-FP16-DAG: "-target-feature" "-vfp4d16sp" // CHECK-VFP3XD-FP16-DAG: "-target-feature" "-fp-armv8d16sp" -// CHECK-VFP3XD-FP16-NOT: "-target-feature" "+fp64" -// CHECK-VFP3XD-FP16-NOT: "-target-feature" "+d32" +// CHECK-VFP3XD-FP16-DAG: "-target-feature" "-fp64" +// CHECK-VFP3XD-FP16-DAG: "-target-feature" "-d32" // CHECK-VFP3XD-FP16-DAG: "-target-feature" "-neon" // CHECK-VFP3XD-FP16-DAG: "-target-feature" "-crypto" @@ -162,7 +162,7 @@ // CHECK-VFP4-D16-DAG: "-target-feature" "+vfp4d16" // CHECK-VFP4-D16-DAG: "-target-feature" "-fp-armv8d16sp" // CHECK-VFP4-D16-DAG: "-target-feature" "+fp64" -// CHECK-VFP4-D16-NOT: "-target-feature" "+d32" +// CHECK-VFP4-D16-DAG: "-target-feature" "-d32" // CHECK-VFP4-D16-DAG: "-target-feature" "-neon" // RUN: %clang -target arm-linux-eabi -mfpu=fp4-sp-d16 %s -### -o %t.o 2>&1 \ @@ -175,8 +175,8 @@ // CHECK-FP4-SP-D16-DAG: "-target-feature" "+soft-float-abi" // CHECK-FP4-SP-D16-DAG: "-target-feature" "+vfp4d16sp" // CHECK-FP4-SP-D16-DAG: "-target-feature" "-fp-armv8d16sp" -// CHECK-FP4-SP-D16-NOT: "-target-feature" "+fp64" -// CHECK-FP4-SP-D16-NOT: "-target-feature" "+d32" +// CHECK-FP4-SP-D16-DAG: "-target-feature" "-fp64" +// CHECK-FP4-SP-D16-DAG: "-target-feature" "-d32" // CHECK-FP4-SP-D16-DAG: "-target-feature" "-neon" // RUN: %clang -target arm-linux-eabi -mfpu=fp5-sp-d16 %s -### -o %t.o 2>&1 \ @@ -189,8 +189,8 @@ // CHECK-FP5-SP-D16-DAG: "-target-feature" "+soft-float-abi" // CHECK-FP5-SP-D16-DAG: "-target-feature" "+fp-armv8d16sp" // CHECK-FP5-SP-D16-DAG: "-target-feature" "-neon" -// CHECK-FP5-SP-D16-NOT: "-target-feature" "+fp64" -// CHECK-FP5-SP-D16-NOT: "-target-feature" "+d32" +// CHECK-FP5-SP-D16-DAG: "-target-feature" "-fp64" +// CHECK-FP5-SP-D16-DAG: "-target-feature" "-d32" // CHECK-FP5-SP-D16-DAG: "-target-feature" "-crypto" // RUN: %clang -target arm-linux-eabi -mfpu=fp5-dp-d16 %s -### -o %t.o 2>&1 \ @@ -203,7 +203,7 @@ // CHECK-FP5-DP-D16-DAG: "-target-feature" "+soft-float-abi" // CHECK-FP5-DP-D16-DAG: "-target-feature" "+fp-armv8d16" // CHECK-FP5-DP-D16-DAG: "-target-feature" "+fp64" -// CHECK-FP5-DP-D16-NOT: "-target-feature" "+d32" +// CHECK-FP5-DP-D16-DAG: "-target-feature" "-d32" // CHECK-FP5-DP-D16-DAG: "-target-feature" "-neon" // CHECK-FP5-DP-D16-DAG: "-target-feature" "-crypto" // CHECK-SOFT-ABI-FP-5-DAG: "-target-feature" "+soft-float" @@ -323,8 +323,8 @@ // CHECK-NO-FP-DAG: "-target-feature" "-vfp3d16sp" // CHECK-NO-FP-DAG: "-target-feature" "-vfp4d16sp" // CHECK-NO-FP-DAG: "-target-feature" "-fp-armv8d16sp" -// CHECK-NO-FP-NOT: "-target-feature" "+fp64" -// CHECK-NO-FP-NOT: "-target-feature" "+d32" +// CHECK-NO-FP-DAG: "-target-feature" "-fp64" +// CHECK-NO-FP-DAG: "-target-feature" "-d32" // CHECK-NO-FP-DAG: "-target-feature" "-neon" // CHECK-NO-FP-DAG: "-target-feature" "-crypto" @@ -382,8 +382,8 @@ // CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+soft-float" // CHECK-ARM7-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+soft-float-abi" // CHECK-ARM7-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+vfp3" -// CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+vfp4" -// CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+fp-armv8" +// CHECK-ARM7-ANDROID-FP-DEFAULT-DAG: "-target-feature" "-vfp4" +// CHECK-ARM7-ANDROID-FP-DEFAULT-DAG: "-target-feature" "-fp-armv8" // CHECK-ARM7-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+neon" // CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: 
"-target-feature" "+crypto" @@ -391,7 +391,7 @@ // RUN: | FileCheck --check-prefix=CHECK-ARM7-ANDROID-FP-D16 %s // CHECK-ARM7-ANDROID-FP-D16-NOT: "-target-feature" "+soft-float" // CHECK-ARM7-ANDROID-FP-D16-DAG: "-target-feature" "+soft-float-abi" -// CHECK-ARM7-ANDROID-FP-D16-NOT: "-target-feature" "+d32" +// CHECK-ARM7-ANDROID-FP-D16-DAG: "-target-feature" "-d32" // CHECK-ARM7-ANDROID-FP-D16-DAG: "-target-feature" "+vfp3d16" // CHECK-ARM7-ANDROID-FP-D16-NOT: "-target-feature" "+vfp4" // CHECK-ARM7-ANDROID-FP-D16-NOT: "-target-feature" "+fp-armv8" @@ -403,3 +403,23 @@ // CHECK-SOFTFLOATABI-INHIBITS-MVE-NOT: "-target-feature" "+mve" // CHECK-SOFTFLOATABI-INHIBITS-MVE-DAG: "-target-feature" "-mve" // CHECK-SOFTFLOATABI-INHIBITS-MVE-DAG: "-target-feature" "-mve.fp" + +// RUN: %clang -target arm-none-none-eabi %s -march=armv8.1-m.main+mve.fp -mfpu=none -### -c 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-MVEFP-FPUNONE %s +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-vfp2sp" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-vfp3d16sp" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-vfp4d16sp" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-fp-armv8d16sp" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-fp64" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-d32" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-neon" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-crypto" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-mve.fp" +// CHECK-MVEFP-FPUNONE-NOT: "-target-feature" "-fpregs" + + +// RUN: %clang -target arm-none-none-eabi %s -march=armv8.1-m.main+mve -mfpu=none -### -c 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-MVEI-FPUNONE %s +// CHECK-MVEI-FPUNONE-DAG: "-target-feature" "-mve.fp" +// CHECK-MVEI-FPUNONE-DAG: "-target-feature" "+mve" +// CHECK-MVEI-FPUNONE-NOT: "-target-feature" "-fpregs" diff --git a/clang/test/Parser/cxx-concepts-ambig-constraint-expr.cpp b/clang/test/Parser/cxx-concepts-ambig-constraint-expr.cpp index 12ab338a6b00a..1cd2605ce0556 100644 --- a/clang/test/Parser/cxx-concepts-ambig-constraint-expr.cpp +++ b/clang/test/Parser/cxx-concepts-ambig-constraint-expr.cpp @@ -5,25 +5,5 @@ // the syntax is consumed without backtracking. 
// type-specifier-seq in conversion-type-id -template requires (bool)&T::operator short -unsigned int foo(); // expected-error {{C++ requires a type specifier for all declarations}} - -// type-specifier-seq in new-type-id -template requires (bool)sizeof new (T::f()) short -unsigned int bar(); // expected-error {{C++ requires a type specifier for all declarations}} - -template requires (bool)sizeof new (T::f()) unsigned // expected-error {{'struct' cannot be signed or unsigned}} -struct X { }; // expected-error {{'X' cannot be defined in a type specifier}} - -// C-style cast -// of function call on function-style cast -template requires (bool(T())) -T (*fp)(); // expected-error {{use of undeclared identifier 'fp'}} - -// function-style cast -// as the callee in a function call -struct A { - static int t; - template requires bool(T()) - (A(T (&t))) { } // expected-error {{called object type 'bool' is not a function or function pointer}} -}; +template requires T::operator short +unsigned int foo(); // expected-error {{C++ requires a type specifier for all declarations}} \ No newline at end of file diff --git a/clang/test/Parser/cxx-concepts-requires-clause.cpp b/clang/test/Parser/cxx-concepts-requires-clause.cpp index 01893a94cbc94..60e7004e08187 100644 --- a/clang/test/Parser/cxx-concepts-requires-clause.cpp +++ b/clang/test/Parser/cxx-concepts-requires-clause.cpp @@ -1,13 +1,11 @@ -// RUN: %clang_cc1 -std=c++14 -fconcepts-ts -x c++ %s -verify -// expected-no-diagnostics +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ %s -verify // Test parsing of the optional requires-clause in a template-declaration. template requires true void foo() { } - -template requires !0 +template requires (!0) struct A { void foo(); struct AA; @@ -27,31 +25,30 @@ struct A { using MQ = M; }; -template requires !0 +template requires (!0) void A::foo() { } -template requires !0 +template requires (!0) struct A::AA { }; -template requires !0 +template requires (!0) enum A::E : int { E0 }; -template requires !0 +template requires (!0) int A::x = 0; -template requires !0 +template requires (!0) template requires true void A::Mfoo() { } -template requires !0 +template requires (!0) template requires true struct A::M { }; -template requires !0 +template requires (!0) template requires true int A::Mx = 0; - template requires true int x = 0; @@ -80,3 +77,81 @@ struct C::M { }; template requires true int C::Mx = 0; + +// Test behavior with non-primary-expression requires clauses + +template requires foo() +// expected-error@-1{{parentheses are required around this expression in a requires clause}} +struct B1 { }; + +int func() { } + +template requires func() +// expected-error@-1{{atomic constraint must be of type 'bool' (found '')}} +// expected-note@-2{{parentheses are required around this expression in a requires clause}} +struct B2 { }; + +template requires (foo()) +struct B3 { }; + +template requires T{} +// expected-error@-1{{parentheses are required around this expression in a requires clause}} +struct B4 { }; + +template requires sizeof(T) == 0 +// expected-error@-1{{parentheses are required around this expression in a requires clause}} +struct B5 { }; + +template requires (sizeof(T)) == 0 +// expected-error@-1{{parentheses are required around this expression in a requires clause}} +struct B6 { }; + +template requires 0 +// expected-error@-1{{atomic constraint must be of type 'bool' (found 'int')}} +(int) bar() { }; + +template requires foo +(int) bar() { }; +// expected-error@-1{{expected '(' for function-style 
cast or type construction}} + +template +void bar() requires foo(); +// expected-error@-1{{parentheses are required around this expression in a requires clause}} + +template +void bar() requires (foo()); + +template +void bar() requires func(); +// expected-error@-1{{atomic constraint must be of type 'bool' (found '')}} +// expected-note@-2{{parentheses are required around this expression in a requires clause}} + +template +void bar() requires T{}; +// expected-error@-1{{parentheses are required around this expression in a requires clause}} + +template +void bar() requires sizeof(T) == 0; +// expected-error@-1{{parentheses are required around this expression in a requires clause}} + +template +void bar() requires (sizeof(T)) == 0; +// expected-error@-1{{parentheses are required around this expression in a requires clause}} + +void bar(int x, int y) requires (x, y, true); + +struct B { + int x; + void foo(int y) requires (x, this, this->x, y, true); + static void bar(int y) requires (x, true); + // expected-error@-1{{'this' cannot be implicitly used in a static member function declaration}} + static void baz(int y) requires (this, true); + // expected-error@-1{{'this' cannot be used in a static member function declaration}} +}; + +auto lambda1 = [] (auto x) requires (sizeof(decltype(x)) == 1) { }; + +auto lambda2 = [] (auto x) constexpr -> int requires (sizeof(decltype(x)) == 1) { return 0; }; + +auto lambda3 = [] requires (sizeof(char) == 1) { }; +// expected-error@-1{{lambda requires '()' before 'requires' clause}} \ No newline at end of file diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index 4d8c6e5c46b63..a03725889360e 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -6551,10 +6551,11 @@ // PPC32-LINUX-NOT: _CALL_LINUX // // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-linux-gnu -target-feature +spe < /dev/null | FileCheck -match-full-lines -check-prefix PPC32-SPE %s +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpcspe-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC32-SPE %s // // PPC32-SPE:#define __NO_FPRS__ 1 // PPC32-SPE:#define __SPE__ 1 -// +// // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-linux-gnu -target-cpu 8548 < /dev/null | FileCheck -match-full-lines -check-prefix PPC8548 %s // // PPC8548:#define __NO_FPRS__ 1 diff --git a/clang/test/Sema/arm-mve-immediates.c b/clang/test/Sema/arm-mve-immediates.c index 54cdb96efcd3b..b8106fbb70282 100644 --- a/clang/test/Sema/arm-mve-immediates.c +++ b/clang/test/Sema/arm-mve-immediates.c @@ -110,3 +110,96 @@ void test_lane_indices(uint8x16_t v16, uint16x8_t v8, vsetq_lane_u64(23, v2, 1); vsetq_lane_u64(23, v2, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} } + +void test_immediate_shifts(uint8x16_t vb, uint16x8_t vh, uint32x4_t vw) +{ + vshlq_n(vb, 0); + vshlq_n(vb, 7); + vshlq_n(vh, 0); + vshlq_n(vh, 15); + vshlq_n(vw, 0); + vshlq_n(vw, 31); + + vshlq_n(vb, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + vshlq_n(vb, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + vshlq_n(vh, -1); // expected-error {{argument value -1 is outside the valid range [0, 15]}} + vshlq_n(vh, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + vshlq_n(vw, -1); // expected-error {{argument value -1 is outside the valid range [0, 31]}} + vshlq_n(vw, 32); // expected-error {{argument value 32 is outside 
the valid range [0, 31]}} + + vqshlq_n(vb, 0); + vqshlq_n(vb, 7); + vqshlq_n(vh, 0); + vqshlq_n(vh, 15); + vqshlq_n(vw, 0); + vqshlq_n(vw, 31); + + vqshlq_n(vb, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + vqshlq_n(vb, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + vqshlq_n(vh, -1); // expected-error {{argument value -1 is outside the valid range [0, 15]}} + vqshlq_n(vh, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + vqshlq_n(vw, -1); // expected-error {{argument value -1 is outside the valid range [0, 31]}} + vqshlq_n(vw, 32); // expected-error {{argument value 32 is outside the valid range [0, 31]}} + + vsliq(vb, vb, 0); + vsliq(vb, vb, 7); + vsliq(vh, vh, 0); + vsliq(vh, vh, 15); + vsliq(vw, vw, 0); + vsliq(vw, vw, 31); + + vsliq(vb, vb, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + vsliq(vb, vb, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + vsliq(vh, vh, -1); // expected-error {{argument value -1 is outside the valid range [0, 15]}} + vsliq(vh, vh, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + vsliq(vw, vw, -1); // expected-error {{argument value -1 is outside the valid range [0, 31]}} + vsliq(vw, vw, 32); // expected-error {{argument value 32 is outside the valid range [0, 31]}} + + vshllbq(vb, 1); + vshllbq(vb, 8); + vshllbq(vh, 1); + vshllbq(vh, 16); + + vshllbq(vb, 0); // expected-error {{argument value 0 is outside the valid range [1, 8]}} + vshllbq(vb, 9); // expected-error {{argument value 9 is outside the valid range [1, 8]}} + vshllbq(vh, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + vshllbq(vh, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + + vshrq(vb, 1); + vshrq(vb, 8); + vshrq(vh, 1); + vshrq(vh, 16); + vshrq(vw, 1); + vshrq(vw, 32); + + vshrq(vb, 0); // expected-error {{argument value 0 is outside the valid range [1, 8]}} + vshrq(vb, 9); // expected-error {{argument value 9 is outside the valid range [1, 8]}} + vshrq(vh, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + vshrq(vh, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + vshrq(vw, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + vshrq(vw, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} + + vshrntq(vb, vh, 1); + vshrntq(vb, vh, 8); + vshrntq(vh, vw, 1); + vshrntq(vh, vw, 16); + + vshrntq(vb, vh, 0); // expected-error {{argument value 0 is outside the valid range [1, 8]}} + vshrntq(vb, vh, 9); // expected-error {{argument value 9 is outside the valid range [1, 8]}} + vshrntq(vh, vw, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + vshrntq(vh, vw, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + + vsriq(vb, vb, 1); + vsriq(vb, vb, 8); + vsriq(vh, vh, 1); + vsriq(vh, vh, 16); + vsriq(vw, vw, 1); + vsriq(vw, vw, 32); + + vsriq(vb, vb, 0); // expected-error {{argument value 0 is outside the valid range [1, 8]}} + vsriq(vb, vb, 9); // expected-error {{argument value 9 is outside the valid range [1, 8]}} + vsriq(vh, vh, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + vsriq(vh, vh, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + vsriq(vw, vw, 0); // expected-error {{argument value 0 is outside the 
valid range [1, 32]}}
+  vsriq(vw, vw, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}}
+}
diff --git a/clang/test/SemaTemplate/instantiate-requires-clause.cpp b/clang/test/SemaTemplate/instantiate-requires-clause.cpp
new file mode 100644
index 0000000000000..f36396b98db73
--- /dev/null
+++ b/clang/test/SemaTemplate/instantiate-requires-clause.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ %s -verify
+
+template <typename... Args> requires ((sizeof(Args) == 1), ...)
+// expected-note@-1 {{because '(sizeof(int) == 1) , (sizeof(char) == 1) , (sizeof(int) == 1)' evaluated to false}}
+void f1(Args&&... args) { }
+// expected-note@-1 {{candidate template ignored: constraints not satisfied [with Args = <int, char, int>]}}
+
+using f11 = decltype(f1('a'));
+using f12 = decltype(f1(1, 'b'));
+using f13 = decltype(f1(1, 'b', 2));
+// expected-error@-1 {{no matching function for call to 'f1'}}
+
+template <typename... Args>
+void f2(Args&&... args) requires ((sizeof(args) == 1), ...) { }
+// expected-note@-1 {{candidate template ignored: constraints not satisfied [with Args = <int, char, int>]}}
+// expected-note@-2 {{because '(sizeof (args) == 1) , (sizeof (args) == 1) , (sizeof (args) == 1)' evaluated to false}}
+
+using f21 = decltype(f2('a'));
+using f22 = decltype(f2(1, 'b'));
+using f23 = decltype(f2(1, 'b', 2));
+// expected-error@-1 {{no matching function for call to 'f2'}}
+
+template <typename... Args> requires ((sizeof(Args) == 1), ...)
+// expected-note@-1 {{because '(sizeof(int) == 1) , (sizeof(char) == 1) , (sizeof(int) == 1)' evaluated to false}}
+void f3(Args&&... args) requires ((sizeof(args) == 1), ...) { }
+// expected-note@-1 {{candidate template ignored: constraints not satisfied [with Args = <int, char, int>]}}
+
+using f31 = decltype(f3('a'));
+using f32 = decltype(f3(1, 'b'));
+using f33 = decltype(f3(1, 'b', 2));
+// expected-error@-1 {{no matching function for call to 'f3'}}
diff --git a/compiler-rt/test/cfi/cross-dso/stats.cpp b/compiler-rt/test/cfi/cross-dso/stats.cpp
index 09a7217bf066a..9d8c2ee3e0d95 100644
--- a/compiler-rt/test/cfi/cross-dso/stats.cpp
+++ b/compiler-rt/test/cfi/cross-dso/stats.cpp
@@ -22,24 +22,24 @@ extern "C" void nvcall(A *a);
 #ifdef SHARED_LIB
 extern "C" __attribute__((noinline)) void vcall(A *a) {
-  // CHECK: stats.cpp:[[@LINE+1]] vcall.cfi cfi-vcall 37
+  // CHECK-DAG: stats.cpp:[[@LINE+1]] vcall.cfi cfi-vcall 37
   a->vf();
 }
 extern "C" __attribute__((noinline)) void nvcall(A *a) {
-  // CHECK: stats.cpp:[[@LINE+1]] nvcall.cfi cfi-nvcall 51
+  // CHECK-DAG: stats.cpp:[[@LINE+1]] nvcall.cfi cfi-nvcall 51
   a->nvf();
 }
 #else
 extern "C" __attribute__((noinline)) A *dcast(A *a) {
-  // CHECK: stats.cpp:[[@LINE+1]] dcast.cfi cfi-derived-cast 24
+  // CHECK-DAG: stats.cpp:[[@LINE+1]] dcast.cfi cfi-derived-cast 24
   return (A *)(ABase *)a;
 }
 extern "C" __attribute__((noinline)) A *ucast(A *a) {
-  // CHECK: stats.cpp:[[@LINE+1]] ucast.cfi cfi-unrelated-cast 81
+  // CHECK-DAG: stats.cpp:[[@LINE+1]] ucast.cfi cfi-unrelated-cast 81
   return (A *)(char *)a;
 }
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/weak_result.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/weak_result.pass.cpp
index a3577ff064275..848858d33b19f 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/weak_result.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/weak_result.pass.cpp
@@ -12,6 +12,8 @@

 // has weak result type

+// REQUIRES: c++98 || c++03 || c++11 || c++14 || c++17
+
 #include <functional>
 #include <type_traits>

diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index 
0ddcd5f971f7f..f3698e9c46e1b 100644 --- a/libcxxabi/CMakeLists.txt +++ b/libcxxabi/CMakeLists.txt @@ -92,6 +92,14 @@ usual symlinks pointing to that.") option(LIBCXXABI_ENABLE_SHARED "Build libc++abi as a shared library." ON) option(LIBCXXABI_ENABLE_STATIC "Build libc++abi as a static library." ON) +option(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI + "Whether the libc++abi tests should link with the shared libc++abi library" + ${LIBCXXABI_ENABLE_SHARED}) + +option(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX + "Whether the libc++abi tests should link with the shared libc++ library" + ${LIBCXX_ENABLE_SHARED}) + cmake_dependent_option(LIBCXXABI_INSTALL_STATIC_LIBRARY "Install the static libc++abi library." ON "LIBCXXABI_ENABLE_STATIC;LIBCXXABI_INSTALL_LIBRARY" OFF) @@ -115,6 +123,26 @@ if (NOT LIBCXXABI_ENABLE_SHARED AND NOT LIBCXXABI_ENABLE_STATIC) message(FATAL_ERROR "libc++abi must be built as either a shared or static library.") endif() +if(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI AND NOT LIBCXXABI_ENABLE_SHARED) + message(FATAL_ERROR "LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI being ON requires LIBCXXABI_ENABLE_SHARED to be ON") +endif() + +if(NOT LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI AND NOT LIBCXXABI_ENABLE_STATIC) + message(FATAL_ERROR "LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI being OFF requires LIBCXXABI_ENABLE_STATIC to be ON") +endif() + +if(DEFINED LIBCXX_ENABLE_SHARED + AND LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX + AND NOT LIBCXX_ENABLE_SHARED) + message(FATAL_ERROR "LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX being ON requires LIBCXX_ENABLE_SHARED to be ON") +endif() + +if(DEFINED LIBCXX_ENABLE_STATIC + AND NOT LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX + AND NOT LIBCXX_ENABLE_STATIC) + message(FATAL_ERROR "LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX being OFF requires LIBCXX_ENABLE_STATIC to be ON") +endif() + if (LLVM_EXTERNAL_LIBCXX_SOURCE_DIR) set(LIBCXXABI_LIBCXX_SRC_DIRS ${LLVM_EXTERNAL_LIBCXX_SOURCE_DIR}) else() @@ -209,7 +237,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LIBCXXABI_LIBRARY_DIR}) # directory. if (NOT LIBCXXABI_LIBCXX_LIBRARY_PATH) set(LIBCXXABI_LIBCXX_LIBRARY_PATH "${LIBCXXABI_LIBRARY_DIR}" CACHE PATH - "The path to libc++ library.") + "The path to libc++ library." FORCE) endif() # Check that we can build with 32 bits if requested. 
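
The four guards above encode one rule: a test-linkage option may only request a library flavor that is actually being built, for libc++abi and libc++ alike, and the defaults preserve the old behavior of linking the shared flavor whenever it exists. A minimal C++ sketch of that validity matrix; the struct, its field names, and the valid() helper are illustrative stand-ins for the CMake cache variables, not part of the build system:

// Hypothetical model of the consistency checks in libcxxabi/CMakeLists.txt.
struct BuildConfig {
  bool abiShared, abiStatic;   // LIBCXXABI_ENABLE_SHARED / _STATIC
  bool cxxShared, cxxStatic;   // LIBCXX_ENABLE_SHARED / _STATIC
  bool testsUseSharedAbi;      // LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI
  bool testsUseSharedCxx;      // LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX
};

bool valid(const BuildConfig &c) {
  if (c.testsUseSharedAbi && !c.abiShared) return false;   // shared libc++abi requested but not built
  if (!c.testsUseSharedAbi && !c.abiStatic) return false;  // static libc++abi requested but not built
  if (c.testsUseSharedCxx && !c.cxxShared) return false;   // shared libc++ requested but not built
  if (!c.testsUseSharedCxx && !c.cxxStatic) return false;  // static libc++ requested but not built
  return true;
}
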
diff --git a/libcxxabi/test/CMakeLists.txt b/libcxxabi/test/CMakeLists.txt index 60e052d20710c..23c0dac5b8568 100644 --- a/libcxxabi/test/CMakeLists.txt +++ b/libcxxabi/test/CMakeLists.txt @@ -20,6 +20,8 @@ pythonize_bool(LIBCXXABI_USE_LLVM_UNWINDER) pythonize_bool(LIBCXXABI_USE_COMPILER_RT) pythonize_bool(LIBCXXABI_BUILD_EXTERNAL_THREAD_LIBRARY) pythonize_bool(LIBCXX_ENABLE_PARALLEL_ALGORITHMS) +pythonize_bool(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX) +pythonize_bool(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI) set(LIBCXXABI_TARGET_INFO "libcxx.test.target_info.LocalTI" CACHE STRING "TargetInfo to use when setting up test environment.") set(LIBCXXABI_EXECUTOR "None" CACHE STRING diff --git a/libcxxabi/test/lit.site.cfg.in b/libcxxabi/test/lit.site.cfg.in index 8ac2fa45cab0b..8125e89a4533c 100644 --- a/libcxxabi/test/lit.site.cfg.in +++ b/libcxxabi/test/lit.site.cfg.in @@ -16,8 +16,8 @@ config.sanitizer_library = "@LIBCXXABI_SANITIZER_LIBRARY@" config.enable_32bit = @LIBCXXABI_BUILD_32_BITS@ config.target_info = "@LIBCXXABI_TARGET_INFO@" config.executor = "@LIBCXXABI_EXECUTOR@" -config.libcxxabi_shared = @LIBCXXABI_ENABLE_SHARED@ -config.enable_shared = @LIBCXX_ENABLE_SHARED@ +config.libcxxabi_shared = @LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI@ +config.enable_shared = @LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX@ config.enable_exceptions = @LIBCXXABI_ENABLE_EXCEPTIONS@ config.host_triple = "@LLVM_HOST_TRIPLE@" config.target_triple = "@TARGET_TRIPLE@" diff --git a/lld/ELF/Arch/Hexagon.cpp b/lld/ELF/Arch/Hexagon.cpp index d29932dd42a2e..106bc9bab5bd2 100644 --- a/lld/ELF/Arch/Hexagon.cpp +++ b/lld/ELF/Arch/Hexagon.cpp @@ -54,6 +54,7 @@ Hexagon::Hexagon() { // Hexagon Linux uses 64K pages by default. defaultMaxPageSize = 0x10000; noneRel = R_HEX_NONE; + tlsGotRel = R_HEX_TPREL_32; } uint32_t Hexagon::calcEFlags() const { @@ -115,6 +116,11 @@ RelExpr Hexagon::getRelExpr(RelType type, const Symbol &s, case R_HEX_B22_PCREL_X: case R_HEX_B32_PCREL_X: return R_PLT_PC; + case R_HEX_IE_32_6_X: + case R_HEX_IE_16_X: + case R_HEX_IE_HI16: + case R_HEX_IE_LO16: + return R_GOT; case R_HEX_GOTREL_11_X: case R_HEX_GOTREL_16_X: case R_HEX_GOTREL_32_6_X: @@ -125,6 +131,13 @@ RelExpr Hexagon::getRelExpr(RelType type, const Symbol &s, case R_HEX_GOT_16_X: case R_HEX_GOT_32_6_X: return R_GOTPLT; + case R_HEX_IE_GOT_11_X: + case R_HEX_IE_GOT_16_X: + case R_HEX_IE_GOT_32_6_X: + case R_HEX_IE_GOT_HI16: + case R_HEX_IE_GOT_LO16: + config->hasStaticTlsModel = true; + return R_GOTPLT; case R_HEX_TPREL_11_X: case R_HEX_TPREL_16: case R_HEX_TPREL_16_X: @@ -227,6 +240,7 @@ void Hexagon::relocateOne(uint8_t *loc, RelType type, uint64_t val) const { or32le(loc, applyMask(0x00203fe0, val & 0x3f)); break; case R_HEX_11_X: + case R_HEX_IE_GOT_11_X: case R_HEX_GOT_11_X: case R_HEX_GOTREL_11_X: case R_HEX_TPREL_11_X: @@ -236,6 +250,8 @@ void Hexagon::relocateOne(uint8_t *loc, RelType type, uint64_t val) const { or32le(loc, applyMask(0x000007e0, val)); break; case R_HEX_16_X: // These relocs only have 6 effective bits. 
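// The IE and IE_GOT cases added below reuse the same mask-driven encoding
// paths as the GOT/GOTREL/TPREL relocations around them. A sketch of the
// scatter semantics assumed here (illustrative; the real applyMask helper
// is defined earlier in Hexagon.cpp and may differ in detail):
//
//   static uint32_t applyMask(uint32_t mask, uint32_t data) {
//     uint32_t result = 0;
//     size_t off = 0;                 // next unconsumed low bit of data
//     for (size_t bit = 0; bit != 32; ++bit)
//       if ((mask >> bit) & 1)        // each set mask bit receives
//         result |= ((data >> off++) & 1) << bit;   // the next data bit
//     return result;
//   }
//
// e.g. applyMask(0x00203fe0, val & 0x3f) routes the six low bits of val
// into instruction bits 5..10, matching the R_HEX_11_X group above.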
+ case R_HEX_IE_16_X: + case R_HEX_IE_GOT_16_X: case R_HEX_GOT_16_X: case R_HEX_GOTREL_16_X: case R_HEX_TPREL_16_X: @@ -251,6 +267,8 @@ void Hexagon::relocateOne(uint8_t *loc, RelType type, uint64_t val) const { case R_HEX_32_6_X: case R_HEX_GOT_32_6_X: case R_HEX_GOTREL_32_6_X: + case R_HEX_IE_GOT_32_6_X: + case R_HEX_IE_32_6_X: case R_HEX_TPREL_32_6_X: or32le(loc, applyMask(0x0fff3fff, val >> 6)); break; @@ -285,11 +303,15 @@ void Hexagon::relocateOne(uint8_t *loc, RelType type, uint64_t val) const { break; case R_HEX_GOTREL_HI16: case R_HEX_HI16: + case R_HEX_IE_GOT_HI16: + case R_HEX_IE_HI16: case R_HEX_TPREL_HI16: or32le(loc, applyMask(0x00c03fff, val >> 16)); break; case R_HEX_GOTREL_LO16: case R_HEX_LO16: + case R_HEX_IE_GOT_LO16: + case R_HEX_IE_LO16: case R_HEX_TPREL_LO16: or32le(loc, applyMask(0x00c03fff, val)); break; diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 8025779d18dc1..1df8a157cfe5e 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -177,7 +177,9 @@ handleTlsRelocation(RelType type, Symbol &sym, InputSectionBase &c, return 1; } - bool canRelax = config->emachine != EM_ARM && config->emachine != EM_RISCV; + bool canRelax = config->emachine != EM_ARM && + config->emachine != EM_HEXAGON && + config->emachine != EM_RISCV; // If we are producing an executable and the symbol is non-preemptable, it // must be defined and the code sequence can be relaxed to use Local-Exec. diff --git a/lld/test/ELF/hexagon-tls-ie.s b/lld/test/ELF/hexagon-tls-ie.s new file mode 100644 index 0000000000000..ea05279473116 --- /dev/null +++ b/lld/test/ELF/hexagon-tls-ie.s @@ -0,0 +1,78 @@ +# REQUIRES: hexagon +# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o +# RUN: llvm-readobj -r %t.o | FileCheck -check-prefix=RELOC %s +# RUN: ld.lld %t.o -o %t +## shared needs -z notext because of the R_HEX_IE_16/32_X(R_GOT) static +## relocations +# RUN: ld.lld -z notext -shared %t.o -o %t.so +# RUN: llvm-objdump -d --no-show-raw-insn --print-imm-hex %t | FileCheck %s +# RUN: llvm-readobj -x .got %t | FileCheck -check-prefix=GOT %s +# RUN: llvm-objdump -d --no-show-raw-insn --print-imm-hex %t.so | \ +# RUN: FileCheck -check-prefix=SHARED %s +# RUN: llvm-readobj -r %t.so | FileCheck -check-prefix=RELA %s + + .globl _start + .type _start, @function +_start: + +# RELOC: 0x0 R_HEX_IE_32_6_X a 0x0 +# RELOC-NEXT: 0x4 R_HEX_IE_16_X a 0x0 +# CHECK: { immext(#0x30180) +# CHECK-NEXT: r2 = memw(##0x301a4) } + r2 = memw(##a@IE) + +# RELOC-NEXT: 0x8 R_HEX_IE_LO16 a 0x0 +# CHECK: { r2.l = #0x1a4 } + r2.l = #a@IE +# RELOC-NEXT: 0xC R_HEX_IE_HI16 a 0x0 +# CHECK: { r2.h = #0x3 } + r2.h = #a@IE + + +# GOT: Hex dump of section '.got': +# GOT-NEXT: 0x000301a4 f0ffffff f4ffffff f8ffffff fcffffff + r2 = memw(##a@IE) + r2 = memw(##b@IE) + r2 = memw(##c@IE) + r2 = memw(##d@IE) + +# RELOC: 0x30 R_HEX_IE_GOT_32_6_X a 0x0 +# RELOC-NEXT: 0x34 R_HEX_IE_GOT_16_X a 0x0 +# SHARED: { immext(#0xfffeffc0) +# SHARED-NEXT: r2 = memw(##0xfffefff0) } + r2 = memw(##a@IEGOT) + +# RELOC-NEXT: 0x38 R_HEX_IE_GOT_LO16 a 0x0 +# SHARED: { r2.l = #0xfff0 } + r2.l = #a@IEGOT +# RELOC-NEXT: 0x3C R_HEX_IE_GOT_HI16 a 0x0 +# SHARED: { r2.h = #0xfffe } + r2.h = #a@IEGOT + +# RELOC: 0x44 R_HEX_IE_GOT_11_X a 0x0 +# SHARED: { immext(#0xfffeffc0) +# SHARED-NEXT: r0 = !cmp.eq(r1,##-0x10010) } + r0=!cmp.eq(r1,##a@iegot) + +# RELA: 0x203C4 R_HEX_TPREL_32 a 0x0 +# RELA-NEXT: 0x203C8 R_HEX_TPREL_32 b 0x0 +# RELA-NEXT: 0x203CC R_HEX_TPREL_32 c 0x0 +# RELA-NEXT: 0x203D0 R_HEX_TPREL_32 d 0x0 + r2 = memw(##b@IEGOT) + 
r2 = memw(##c@IEGOT) + r2 = memw(##d@IEGOT) + + +.section .tdata,"awT",@progbits +.globl a +a: +.word 1 +.globl b +b: +.word 2 +.globl c +c: +.word 3 +.globl d +d: +.word 4 diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index 6170ab625c54d..573b8556989e4 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -56,7 +56,7 @@ if (LLDB_ENABLE_PYTHON) endif () if (LLDB_ENABLE_PYTHON OR LLDB_ENABLE_LUA) - add_subdirectory(scripts) + add_subdirectory(bindings) endif () # We need the headers generated by instrinsics_gen before we can compile @@ -97,7 +97,7 @@ if(LLDB_INCLUDE_TESTS) endif() if (LLDB_ENABLE_PYTHON) - get_target_property(lldb_scripts_dir swig_wrapper BINARY_DIR) + get_target_property(lldb_bindings_dir swig_wrapper BINARY_DIR) if(LLDB_BUILD_FRAMEWORK) set(lldb_python_build_path "${LLDB_FRAMEWORK_ABSOLUTE_BUILD_DIR}/LLDB.framework/Resources/Python/lldb") @@ -109,7 +109,7 @@ if (LLDB_ENABLE_PYTHON) # to liblldb.so for the Python API(hardlink on Windows). add_custom_target(finish_swig ALL VERBATIM COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_build_path} - DEPENDS ${lldb_scripts_dir}/lldb.py + DEPENDS ${lldb_bindings_dir}/lldb.py COMMENT "Python script sym-linking LLDB Python API") if(NOT LLDB_USE_SYSTEM_SIX) @@ -121,7 +121,7 @@ if (LLDB_ENABLE_PYTHON) add_custom_command(TARGET finish_swig POST_BUILD VERBATIM COMMAND ${CMAKE_COMMAND} -E copy - "${lldb_scripts_dir}/lldb.py" + "${lldb_bindings_dir}/lldb.py" "${lldb_python_build_path}/__init__.py") function(create_python_package pkg_dir) @@ -131,7 +131,7 @@ if (LLDB_ENABLE_PYTHON) endif() if(NOT ARG_NOINIT) set(init_cmd COMMAND ${PYTHON_EXECUTABLE} - "${LLDB_SOURCE_DIR}/scripts/Python/createPythonInit.py" + "${LLDB_SOURCE_DIR}/bindings/python/createPythonInit.py" "${pkg_dir}" ${ARG_FILES}) endif() add_custom_command(TARGET finish_swig POST_BUILD VERBATIM diff --git a/lldb/scripts/CMakeLists.txt b/lldb/bindings/CMakeLists.txt similarity index 93% rename from lldb/scripts/CMakeLists.txt rename to lldb/bindings/CMakeLists.txt index 515c63293bc20..92ae402c478e9 100644 --- a/lldb/scripts/CMakeLists.txt +++ b/lldb/bindings/CMakeLists.txt @@ -1,4 +1,4 @@ -file(GLOB SWIG_INTERFACES interface/*.i) +file(GLOB SWIG_INTERFACES interfaces/*.i) file(GLOB_RECURSE SWIG_SOURCES *.swig) file(GLOB SWIG_HEADERS ${LLDB_SOURCE_DIR}/include/lldb/API/*.h @@ -46,7 +46,7 @@ if (LLDB_ENABLE_PYTHON) -python -threads -o ${CMAKE_CURRENT_BINARY_DIR}/LLDBWrapPython.cpp - ${LLDB_SOURCE_DIR}/scripts/lldb.swig + ${LLDB_SOURCE_DIR}/bindings/python.swig VERBATIM COMMENT "Builds LLDB Python wrapper") @@ -67,7 +67,7 @@ if (LLDB_ENABLE_LUA) -lua -w503 -o ${CMAKE_CURRENT_BINARY_DIR}/LLDBWrapLua.cpp - ${LLDB_SOURCE_DIR}/scripts/lldb_lua.swig + ${LLDB_SOURCE_DIR}/bindings/lua.swig VERBATIM COMMENT "Builds LLDB Lua wrapper") diff --git a/lldb/scripts/headers.swig b/lldb/bindings/headers.swig similarity index 100% rename from lldb/scripts/headers.swig rename to lldb/bindings/headers.swig diff --git a/lldb/scripts/interface/SBAddress.i b/lldb/bindings/interface/SBAddress.i similarity index 99% rename from lldb/scripts/interface/SBAddress.i rename to lldb/bindings/interface/SBAddress.i index 6c5352bac6d7c..4658534d153ea 100644 --- a/lldb/scripts/interface/SBAddress.i +++ b/lldb/bindings/interface/SBAddress.i @@ -140,6 +140,8 @@ public: lldb::SBLineEntry GetLineEntry (); + STRING_EXTENSION(SBAddress) + #ifdef SWIGPYTHON %pythoncode %{ def __get_load_addr_property__ (self): diff --git a/lldb/scripts/interface/SBAttachInfo.i 
b/lldb/bindings/interface/SBAttachInfo.i similarity index 100% rename from lldb/scripts/interface/SBAttachInfo.i rename to lldb/bindings/interface/SBAttachInfo.i diff --git a/lldb/scripts/interface/SBBlock.i b/lldb/bindings/interface/SBBlock.i similarity index 99% rename from lldb/scripts/interface/SBBlock.i rename to lldb/bindings/interface/SBBlock.i index 73079a11760c6..8bd8e37953cfc 100644 --- a/lldb/scripts/interface/SBBlock.i +++ b/lldb/bindings/interface/SBBlock.i @@ -100,6 +100,8 @@ public: bool locals, bool statics); + STRING_EXTENSION(SBBlock) + #ifdef SWIGPYTHON %pythoncode %{ def get_range_at_index(self, idx): diff --git a/lldb/scripts/interface/SBBreakpoint.i b/lldb/bindings/interface/SBBreakpoint.i similarity index 99% rename from lldb/scripts/interface/SBBreakpoint.i rename to lldb/bindings/interface/SBBreakpoint.i index f84f2ada3d329..20354346be900 100644 --- a/lldb/scripts/interface/SBBreakpoint.i +++ b/lldb/bindings/interface/SBBreakpoint.i @@ -249,6 +249,8 @@ public: bool IsHardware (); + STRING_EXTENSION(SBBreakpoint) + #ifdef SWIGPYTHON %pythoncode %{ diff --git a/lldb/scripts/interface/SBBreakpointLocation.i b/lldb/bindings/interface/SBBreakpointLocation.i similarity index 97% rename from lldb/scripts/interface/SBBreakpointLocation.i rename to lldb/bindings/interface/SBBreakpointLocation.i index 44fd42b514f7f..dc39c83c2d67b 100644 --- a/lldb/scripts/interface/SBBreakpointLocation.i +++ b/lldb/bindings/interface/SBBreakpointLocation.i @@ -134,6 +134,8 @@ public: SBBreakpoint GetBreakpoint (); + + STRING_EXTENSION_LEVEL(SBBreakpointLocation, lldb::eDescriptionLevelFull) }; } // namespace lldb diff --git a/lldb/scripts/interface/SBBreakpointName.i b/lldb/bindings/interface/SBBreakpointName.i similarity index 98% rename from lldb/scripts/interface/SBBreakpointName.i rename to lldb/bindings/interface/SBBreakpointName.i index 2a06d0a2105f0..e280d42245915 100644 --- a/lldb/scripts/interface/SBBreakpointName.i +++ b/lldb/bindings/interface/SBBreakpointName.i @@ -108,6 +108,7 @@ public: bool GetDescription(lldb::SBStream &description); + STRING_EXTENSION(SBBreakpointName) }; } // namespace lldb diff --git a/lldb/scripts/interface/SBBroadcaster.i b/lldb/bindings/interface/SBBroadcaster.i similarity index 100% rename from lldb/scripts/interface/SBBroadcaster.i rename to lldb/bindings/interface/SBBroadcaster.i diff --git a/lldb/scripts/interface/SBCommandInterpreter.i b/lldb/bindings/interface/SBCommandInterpreter.i similarity index 100% rename from lldb/scripts/interface/SBCommandInterpreter.i rename to lldb/bindings/interface/SBCommandInterpreter.i diff --git a/lldb/scripts/interface/SBCommandReturnObject.i b/lldb/bindings/interface/SBCommandReturnObject.i similarity index 98% rename from lldb/scripts/interface/SBCommandReturnObject.i rename to lldb/bindings/interface/SBCommandReturnObject.i index 73d4001aaba59..affa16520f28d 100644 --- a/lldb/scripts/interface/SBCommandReturnObject.i +++ b/lldb/bindings/interface/SBCommandReturnObject.i @@ -96,6 +96,8 @@ public: void SetImmediateOutputFile(lldb::FileSP BORROWED); void SetImmediateErrorFile(lldb::FileSP BORROWED); + STRING_EXTENSION(SBCommandReturnObject) + %extend { // transfer_ownership does nothing, and is here for compatibility with // old scripts. Ownership is tracked by reference count in the ordinary way. 
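
The STRING_EXTENSION(...) lines threaded through these interface files all expand to the same thing: a SWIG %extend __str__ that renders the object via its GetDescription overload (see the new lldb/bindings/macros.swig later in this patch). Reduced to plain C++, the generated body behaves like the sketch below; describe() and the SBError parameter are illustrative stand-ins for the __str__ that the macro emits on each SB class:

#include <string>
#include "lldb/API/SBError.h"
#include "lldb/API/SBStream.h"

// Stand-in for the __str__ generated by STRING_EXTENSION(SBError).
static std::string describe(lldb::SBError &obj) {
  lldb::SBStream stream;
  obj.GetDescription(stream);   // each SB class provides a GetDescription
  const char *desc = stream.GetData();
  size_t len = stream.GetSize();
  // Trim one trailing newline so Python's print() does not double-space.
  if (len > 0 && (desc[len - 1] == '\n' || desc[len - 1] == '\r'))
    --len;
  return std::string(desc, len);
}
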
diff --git a/lldb/scripts/interface/SBCommunication.i b/lldb/bindings/interface/SBCommunication.i similarity index 100% rename from lldb/scripts/interface/SBCommunication.i rename to lldb/bindings/interface/SBCommunication.i diff --git a/lldb/scripts/interface/SBCompileUnit.i b/lldb/bindings/interface/SBCompileUnit.i similarity index 99% rename from lldb/scripts/interface/SBCompileUnit.i rename to lldb/bindings/interface/SBCompileUnit.i index bc2d45ae8e56c..d6a4c07038c65 100644 --- a/lldb/scripts/interface/SBCompileUnit.i +++ b/lldb/bindings/interface/SBCompileUnit.i @@ -116,6 +116,8 @@ public: bool operator != (const lldb::SBCompileUnit &rhs) const; + STRING_EXTENSION(SBCompileUnit) + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBData.i b/lldb/bindings/interface/SBData.i similarity index 99% rename from lldb/scripts/interface/SBData.i rename to lldb/bindings/interface/SBData.i index fdaa6962f0eca..3e74240329e05 100644 --- a/lldb/scripts/interface/SBData.i +++ b/lldb/bindings/interface/SBData.i @@ -134,6 +134,8 @@ public: bool SetDataFromDoubleArray (double* array, size_t array_len); + STRING_EXTENSION(SBData) + #ifdef SWIGPYTHON %pythoncode %{ diff --git a/lldb/scripts/interface/SBDebugger.i b/lldb/bindings/interface/SBDebugger.i similarity index 99% rename from lldb/scripts/interface/SBDebugger.i rename to lldb/bindings/interface/SBDebugger.i index 52f65841893c6..f2e23a7ed7804 100644 --- a/lldb/scripts/interface/SBDebugger.i +++ b/lldb/bindings/interface/SBDebugger.i @@ -479,6 +479,8 @@ public: lldb::SBTypeSynthetic GetSyntheticForType (lldb::SBTypeNameSpecifier); + STRING_EXTENSION(SBDebugger) + %feature("docstring", "Launch a command interpreter session. Commands are read from standard input or from the input handle specified for the debugger object. 
Output/errors are diff --git a/lldb/scripts/interface/SBDeclaration.i b/lldb/bindings/interface/SBDeclaration.i similarity index 97% rename from lldb/scripts/interface/SBDeclaration.i rename to lldb/bindings/interface/SBDeclaration.i index cdaec85676461..621c1a0ab7c87 100644 --- a/lldb/scripts/interface/SBDeclaration.i +++ b/lldb/bindings/interface/SBDeclaration.i @@ -53,6 +53,8 @@ namespace lldb { bool operator != (const lldb::SBDeclaration &rhs) const; + STRING_EXTENSION(SBDeclaration) + #ifdef SWIGPYTHON %pythoncode %{ file = property(GetFileSpec, None, doc='''A read only property that returns an lldb object that represents the file (lldb.SBFileSpec) for this line entry.''') diff --git a/lldb/scripts/interface/SBError.i b/lldb/bindings/interface/SBError.i similarity index 99% rename from lldb/scripts/interface/SBError.i rename to lldb/bindings/interface/SBError.i index 96cd6c4886f5f..ea48e2263a77a 100644 --- a/lldb/scripts/interface/SBError.i +++ b/lldb/bindings/interface/SBError.i @@ -105,6 +105,8 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBError) + #ifdef SWIGPYTHON %pythoncode %{ value = property(GetError, None, doc='''A read only property that returns the same result as GetError().''') diff --git a/lldb/scripts/interface/SBEvent.i b/lldb/bindings/interface/SBEvent.i similarity index 100% rename from lldb/scripts/interface/SBEvent.i rename to lldb/bindings/interface/SBEvent.i diff --git a/lldb/scripts/interface/SBExecutionContext.i b/lldb/bindings/interface/SBExecutionContext.i similarity index 100% rename from lldb/scripts/interface/SBExecutionContext.i rename to lldb/bindings/interface/SBExecutionContext.i diff --git a/lldb/scripts/interface/SBExpressionOptions.i b/lldb/bindings/interface/SBExpressionOptions.i similarity index 100% rename from lldb/scripts/interface/SBExpressionOptions.i rename to lldb/bindings/interface/SBExpressionOptions.i diff --git a/lldb/scripts/interface/SBFile.i b/lldb/bindings/interface/SBFile.i similarity index 100% rename from lldb/scripts/interface/SBFile.i rename to lldb/bindings/interface/SBFile.i diff --git a/lldb/scripts/interface/SBFileSpec.i b/lldb/bindings/interface/SBFileSpec.i similarity index 98% rename from lldb/scripts/interface/SBFileSpec.i rename to lldb/bindings/interface/SBFileSpec.i index 07a7630ebbac8..d287a940c051a 100644 --- a/lldb/scripts/interface/SBFileSpec.i +++ b/lldb/bindings/interface/SBFileSpec.i @@ -80,6 +80,8 @@ public: void AppendPathComponent (const char *file_or_directory); + STRING_EXTENSION(SBFileSpec) + #ifdef SWIGPYTHON %pythoncode %{ def __get_fullpath__(self): diff --git a/lldb/scripts/interface/SBFileSpecList.i b/lldb/bindings/interface/SBFileSpecList.i similarity index 100% rename from lldb/scripts/interface/SBFileSpecList.i rename to lldb/bindings/interface/SBFileSpecList.i diff --git a/lldb/scripts/interface/SBFrame.i b/lldb/bindings/interface/SBFrame.i similarity index 99% rename from lldb/scripts/interface/SBFrame.i rename to lldb/bindings/interface/SBFrame.i index 811f7f22f9b4d..c65b88f863e7d 100644 --- a/lldb/scripts/interface/SBFrame.i +++ b/lldb/bindings/interface/SBFrame.i @@ -285,6 +285,8 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBFrame) + #ifdef SWIGPYTHON %pythoncode %{ def get_all_variables(self): diff --git a/lldb/scripts/interface/SBFunction.i b/lldb/bindings/interface/SBFunction.i similarity index 99% rename from lldb/scripts/interface/SBFunction.i rename to lldb/bindings/interface/SBFunction.i index 
7b157bb388169..630c4db22c55d 100644 --- a/lldb/scripts/interface/SBFunction.i +++ b/lldb/bindings/interface/SBFunction.i @@ -111,6 +111,8 @@ public: bool operator != (const lldb::SBFunction &rhs) const; + STRING_EXTENSION(SBFunction) + #ifdef SWIGPYTHON %pythoncode %{ def get_instructions_from_current_target (self): diff --git a/lldb/scripts/interface/SBHostOS.i b/lldb/bindings/interface/SBHostOS.i similarity index 100% rename from lldb/scripts/interface/SBHostOS.i rename to lldb/bindings/interface/SBHostOS.i diff --git a/lldb/scripts/interface/SBInstruction.i b/lldb/bindings/interface/SBInstruction.i similarity index 98% rename from lldb/scripts/interface/SBInstruction.i rename to lldb/bindings/interface/SBInstruction.i index 09688214630b1..d50a080fd0454 100644 --- a/lldb/scripts/interface/SBInstruction.i +++ b/lldb/bindings/interface/SBInstruction.i @@ -74,6 +74,8 @@ public: bool TestEmulation (lldb::SBStream &output_stream, const char *test_file); + STRING_EXTENSION(SBInstruction) + #ifdef SWIGPYTHON %pythoncode %{ def __mnemonic_property__ (self): diff --git a/lldb/scripts/interface/SBInstructionList.i b/lldb/bindings/interface/SBInstructionList.i similarity index 98% rename from lldb/scripts/interface/SBInstructionList.i rename to lldb/bindings/interface/SBInstructionList.i index d50deba4f5e1f..1357323027573 100644 --- a/lldb/scripts/interface/SBInstructionList.i +++ b/lldb/bindings/interface/SBInstructionList.i @@ -66,6 +66,8 @@ public: bool DumpEmulationForAllInstructions (const char *triple); + STRING_EXTENSION(SBInstructionList) + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBLanguageRuntime.i b/lldb/bindings/interface/SBLanguageRuntime.i similarity index 100% rename from lldb/scripts/interface/SBLanguageRuntime.i rename to lldb/bindings/interface/SBLanguageRuntime.i diff --git a/lldb/scripts/interface/SBLaunchInfo.i b/lldb/bindings/interface/SBLaunchInfo.i similarity index 100% rename from lldb/scripts/interface/SBLaunchInfo.i rename to lldb/bindings/interface/SBLaunchInfo.i diff --git a/lldb/scripts/interface/SBLineEntry.i b/lldb/bindings/interface/SBLineEntry.i similarity index 98% rename from lldb/scripts/interface/SBLineEntry.i rename to lldb/bindings/interface/SBLineEntry.i index 90f60df23247d..be365377ba8b0 100644 --- a/lldb/scripts/interface/SBLineEntry.i +++ b/lldb/bindings/interface/SBLineEntry.i @@ -84,6 +84,8 @@ public: bool operator != (const lldb::SBLineEntry &rhs) const; + STRING_EXTENSION(SBLineEntry) + #ifdef SWIGPYTHON %pythoncode %{ file = property(GetFileSpec, None, doc='''A read only property that returns an lldb object that represents the file (lldb.SBFileSpec) for this line entry.''') diff --git a/lldb/scripts/interface/SBListener.i b/lldb/bindings/interface/SBListener.i similarity index 100% rename from lldb/scripts/interface/SBListener.i rename to lldb/bindings/interface/SBListener.i diff --git a/lldb/scripts/interface/SBMemoryRegionInfo.i b/lldb/bindings/interface/SBMemoryRegionInfo.i similarity index 96% rename from lldb/scripts/interface/SBMemoryRegionInfo.i rename to lldb/bindings/interface/SBMemoryRegionInfo.i index 7a59d0051ceac..6a2ad6a3e3649 100644 --- a/lldb/scripts/interface/SBMemoryRegionInfo.i +++ b/lldb/bindings/interface/SBMemoryRegionInfo.i @@ -55,6 +55,7 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBMemoryRegionInfo) }; } // namespace lldb diff --git a/lldb/scripts/interface/SBMemoryRegionInfoList.i b/lldb/bindings/interface/SBMemoryRegionInfoList.i similarity 
index 100% rename from lldb/scripts/interface/SBMemoryRegionInfoList.i rename to lldb/bindings/interface/SBMemoryRegionInfoList.i diff --git a/lldb/scripts/interface/SBModule.i b/lldb/bindings/interface/SBModule.i similarity index 99% rename from lldb/scripts/interface/SBModule.i rename to lldb/bindings/interface/SBModule.i index 03c8aeb2bed9e..a9d9480cd7cf1 100644 --- a/lldb/scripts/interface/SBModule.i +++ b/lldb/bindings/interface/SBModule.i @@ -344,6 +344,8 @@ public: lldb::SBAddress GetObjectFileEntryPointAddress() const; + STRING_EXTENSION(SBModule) + #ifdef SWIGPYTHON %pythoncode %{ def __len__(self): diff --git a/lldb/scripts/interface/SBModuleSpec.i b/lldb/bindings/interface/SBModuleSpec.i similarity index 97% rename from lldb/scripts/interface/SBModuleSpec.i rename to lldb/bindings/interface/SBModuleSpec.i index ec4e9bb7fbf72..64d0aa641a774 100644 --- a/lldb/scripts/interface/SBModuleSpec.i +++ b/lldb/bindings/interface/SBModuleSpec.i @@ -91,6 +91,7 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBModuleSpec) }; @@ -127,6 +128,7 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBModuleSpecList) }; } // namespace lldb diff --git a/lldb/scripts/interface/SBPlatform.i b/lldb/bindings/interface/SBPlatform.i similarity index 100% rename from lldb/scripts/interface/SBPlatform.i rename to lldb/bindings/interface/SBPlatform.i diff --git a/lldb/scripts/interface/SBProcess.i b/lldb/bindings/interface/SBProcess.i similarity index 99% rename from lldb/scripts/interface/SBProcess.i rename to lldb/bindings/interface/SBProcess.i index c5ebc24686155..ac6a265faec9f 100644 --- a/lldb/scripts/interface/SBProcess.i +++ b/lldb/bindings/interface/SBProcess.i @@ -417,6 +417,8 @@ public: lldb::SBProcessInfo GetProcessInfo(); + STRING_EXTENSION(SBProcess) + #ifdef SWIGPYTHON %pythoncode %{ def __get_is_alive__(self): diff --git a/lldb/scripts/interface/SBProcessInfo.i b/lldb/bindings/interface/SBProcessInfo.i similarity index 100% rename from lldb/scripts/interface/SBProcessInfo.i rename to lldb/bindings/interface/SBProcessInfo.i diff --git a/lldb/scripts/interface/SBQueue.i b/lldb/bindings/interface/SBQueue.i similarity index 100% rename from lldb/scripts/interface/SBQueue.i rename to lldb/bindings/interface/SBQueue.i diff --git a/lldb/scripts/interface/SBQueueItem.i b/lldb/bindings/interface/SBQueueItem.i similarity index 100% rename from lldb/scripts/interface/SBQueueItem.i rename to lldb/bindings/interface/SBQueueItem.i diff --git a/lldb/scripts/interface/SBSection.i b/lldb/bindings/interface/SBSection.i similarity index 99% rename from lldb/scripts/interface/SBSection.i rename to lldb/bindings/interface/SBSection.i index c1a84acc4f144..3d1c900917fd8 100644 --- a/lldb/scripts/interface/SBSection.i +++ b/lldb/bindings/interface/SBSection.i @@ -114,6 +114,8 @@ public: bool operator != (const lldb::SBSection &rhs); + STRING_EXTENSION(SBSection) + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBSourceManager.i b/lldb/bindings/interface/SBSourceManager.i similarity index 100% rename from lldb/scripts/interface/SBSourceManager.i rename to lldb/bindings/interface/SBSourceManager.i diff --git a/lldb/scripts/interface/SBStream.i b/lldb/bindings/interface/SBStream.i similarity index 100% rename from lldb/scripts/interface/SBStream.i rename to lldb/bindings/interface/SBStream.i diff --git a/lldb/scripts/interface/SBStringList.i b/lldb/bindings/interface/SBStringList.i similarity index 100% rename from 
lldb/scripts/interface/SBStringList.i rename to lldb/bindings/interface/SBStringList.i diff --git a/lldb/scripts/interface/SBStructuredData.i b/lldb/bindings/interface/SBStructuredData.i similarity index 100% rename from lldb/scripts/interface/SBStructuredData.i rename to lldb/bindings/interface/SBStructuredData.i diff --git a/lldb/scripts/interface/SBSymbol.i b/lldb/bindings/interface/SBSymbol.i similarity index 99% rename from lldb/scripts/interface/SBSymbol.i rename to lldb/bindings/interface/SBSymbol.i index e5880e66d300f..4e17ab5af0fd6 100644 --- a/lldb/scripts/interface/SBSymbol.i +++ b/lldb/bindings/interface/SBSymbol.i @@ -72,6 +72,8 @@ public: bool operator != (const lldb::SBSymbol &rhs) const; + STRING_EXTENSION(SBSymbol) + #ifdef SWIGPYTHON %pythoncode %{ def get_instructions_from_current_target (self): diff --git a/lldb/scripts/interface/SBSymbolContext.i b/lldb/bindings/interface/SBSymbolContext.i similarity index 99% rename from lldb/scripts/interface/SBSymbolContext.i rename to lldb/bindings/interface/SBSymbolContext.i index a6aa4d78bfe39..b6b336516c949 100644 --- a/lldb/scripts/interface/SBSymbolContext.i +++ b/lldb/bindings/interface/SBSymbolContext.i @@ -81,6 +81,7 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBSymbolContext) #ifdef SWIGPYTHON %pythoncode %{ diff --git a/lldb/scripts/interface/SBSymbolContextList.i b/lldb/bindings/interface/SBSymbolContextList.i similarity index 99% rename from lldb/scripts/interface/SBSymbolContextList.i rename to lldb/bindings/interface/SBSymbolContextList.i index 54adc659fa212..f5adcfcebfb56 100644 --- a/lldb/scripts/interface/SBSymbolContextList.i +++ b/lldb/bindings/interface/SBSymbolContextList.i @@ -60,6 +60,8 @@ public: void Clear(); + STRING_EXTENSION(SBSymbolContextList) + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBTarget.i b/lldb/bindings/interface/SBTarget.i similarity index 98% rename from lldb/scripts/interface/SBTarget.i rename to lldb/bindings/interface/SBTarget.i index 02c70b6e1cd6f..371bf5c35ebd0 100644 --- a/lldb/scripts/interface/SBTarget.i +++ b/lldb/bindings/interface/SBTarget.i @@ -967,21 +967,7 @@ public: lldb::SBValue EvaluateExpression (const char *expr, const lldb::SBExpressionOptions &options); - %extend { - %nothreadallow; - std::string lldb::SBTarget::__str__(){ - lldb::SBStream stream; - $self->GetDescription (stream, lldb::eDescriptionLevelBrief); - - const char *desc = stream.GetData(); - size_t desc_len = stream.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - - return std::string(desc, desc_len); - } - %clearnothreadallow; - } + STRING_EXTENSION_LEVEL(SBTarget, lldb::eDescriptionLevelBrief) #ifdef SWIGPYTHON %pythoncode %{ diff --git a/lldb/scripts/interface/SBThread.i b/lldb/bindings/interface/SBThread.i similarity index 99% rename from lldb/scripts/interface/SBThread.i rename to lldb/bindings/interface/SBThread.i index c1c045487fc1d..95b15b182ec26 100644 --- a/lldb/scripts/interface/SBThread.i +++ b/lldb/bindings/interface/SBThread.i @@ -402,6 +402,8 @@ public: bool SafeToCallFunctions (); + STRING_EXTENSION(SBThread) + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBThreadCollection.i b/lldb/bindings/interface/SBThreadCollection.i similarity index 100% rename from lldb/scripts/interface/SBThreadCollection.i rename to lldb/bindings/interface/SBThreadCollection.i diff --git a/lldb/scripts/interface/SBThreadPlan.i 
b/lldb/bindings/interface/SBThreadPlan.i similarity index 100% rename from lldb/scripts/interface/SBThreadPlan.i rename to lldb/bindings/interface/SBThreadPlan.i diff --git a/lldb/scripts/interface/SBTrace.i b/lldb/bindings/interface/SBTrace.i similarity index 100% rename from lldb/scripts/interface/SBTrace.i rename to lldb/bindings/interface/SBTrace.i diff --git a/lldb/scripts/interface/SBTraceOptions.i b/lldb/bindings/interface/SBTraceOptions.i similarity index 100% rename from lldb/scripts/interface/SBTraceOptions.i rename to lldb/bindings/interface/SBTraceOptions.i diff --git a/lldb/scripts/interface/SBType.i b/lldb/bindings/interface/SBType.i similarity index 98% rename from lldb/scripts/interface/SBType.i rename to lldb/bindings/interface/SBType.i index d9da9e39b9560..3cd82452084b4 100644 --- a/lldb/scripts/interface/SBType.i +++ b/lldb/bindings/interface/SBType.i @@ -43,6 +43,8 @@ public: uint32_t GetBitfieldSizeInBits(); + STRING_EXTENSION_LEVEL(SBTypeMember, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ name = property(GetName, None, doc='''A read only property that returns the name for this member as a string.''') @@ -100,6 +102,7 @@ public: GetDescription (lldb::SBStream &description, lldb::DescriptionLevel description_level); + STRING_EXTENSION_LEVEL(SBTypeMemberFunction, lldb::eDescriptionLevelBrief) protected: lldb::TypeMemberFunctionImplSP m_opaque_sp; }; @@ -314,6 +317,8 @@ public: bool operator!=(lldb::SBType &rhs); + STRING_EXTENSION_LEVEL(SBType, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ def template_arg_array(self): diff --git a/lldb/scripts/interface/SBTypeCategory.i b/lldb/bindings/interface/SBTypeCategory.i similarity index 99% rename from lldb/scripts/interface/SBTypeCategory.i rename to lldb/bindings/interface/SBTypeCategory.i index 43fe9faf70f52..b762bf8a95a36 100644 --- a/lldb/scripts/interface/SBTypeCategory.i +++ b/lldb/bindings/interface/SBTypeCategory.i @@ -124,6 +124,8 @@ namespace lldb { bool DeleteTypeSynthetic (lldb::SBTypeNameSpecifier); + STRING_EXTENSION_LEVEL(SBTypeCategory, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ diff --git a/lldb/scripts/interface/SBTypeEnumMember.i b/lldb/bindings/interface/SBTypeEnumMember.i similarity index 97% rename from lldb/scripts/interface/SBTypeEnumMember.i rename to lldb/bindings/interface/SBTypeEnumMember.i index b2d8617117823..006bdeaa8cee1 100644 --- a/lldb/scripts/interface/SBTypeEnumMember.i +++ b/lldb/bindings/interface/SBTypeEnumMember.i @@ -43,6 +43,7 @@ public: GetDescription (lldb::SBStream &description, lldb::DescriptionLevel description_level); + STRING_EXTENSION_LEVEL(SBTypeEnumMember, lldb::eDescriptionLevelBrief) #ifdef SWIGPYTHON %pythoncode %{ name = property(GetName, None, doc='''A read only property that returns the name for this enum member as a string.''') diff --git a/lldb/scripts/interface/SBTypeFilter.i b/lldb/bindings/interface/SBTypeFilter.i similarity index 95% rename from lldb/scripts/interface/SBTypeFilter.i rename to lldb/bindings/interface/SBTypeFilter.i index 3759e0a23d418..c1d282c6d4fbe 100644 --- a/lldb/scripts/interface/SBTypeFilter.i +++ b/lldb/bindings/interface/SBTypeFilter.i @@ -61,6 +61,8 @@ namespace lldb { bool operator != (lldb::SBTypeFilter &rhs); + STRING_EXTENSION_LEVEL(SBTypeFilter, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ options = property(GetOptions, SetOptions) diff --git a/lldb/scripts/interface/SBTypeFormat.i b/lldb/bindings/interface/SBTypeFormat.i similarity index 95% rename 
from lldb/scripts/interface/SBTypeFormat.i rename to lldb/bindings/interface/SBTypeFormat.i index 5efd135b73261..765a2a7bb99dc 100644 --- a/lldb/scripts/interface/SBTypeFormat.i +++ b/lldb/bindings/interface/SBTypeFormat.i @@ -61,6 +61,8 @@ namespace lldb { bool operator != (lldb::SBTypeFormat &rhs); + STRING_EXTENSION_LEVEL(SBTypeFormat, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ format = property(GetFormat, SetFormat) diff --git a/lldb/scripts/interface/SBTypeNameSpecifier.i b/lldb/bindings/interface/SBTypeNameSpecifier.i similarity index 94% rename from lldb/scripts/interface/SBTypeNameSpecifier.i rename to lldb/bindings/interface/SBTypeNameSpecifier.i index bb16e86b0bc50..772f7c174093f 100644 --- a/lldb/scripts/interface/SBTypeNameSpecifier.i +++ b/lldb/bindings/interface/SBTypeNameSpecifier.i @@ -53,6 +53,8 @@ namespace lldb { bool operator != (lldb::SBTypeNameSpecifier &rhs); + STRING_EXTENSION_LEVEL(SBTypeNameSpecifier, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ name = property(GetName) diff --git a/lldb/scripts/interface/SBTypeSummary.i b/lldb/bindings/interface/SBTypeSummary.i similarity index 97% rename from lldb/scripts/interface/SBTypeSummary.i rename to lldb/bindings/interface/SBTypeSummary.i index 225a404cf73c4..adcc79b5a6ee8 100644 --- a/lldb/scripts/interface/SBTypeSummary.i +++ b/lldb/bindings/interface/SBTypeSummary.i @@ -101,6 +101,8 @@ namespace lldb { bool operator != (lldb::SBTypeSummary &rhs); + STRING_EXTENSION_LEVEL(SBTypeSummary, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ options = property(GetOptions, SetOptions) diff --git a/lldb/scripts/interface/SBTypeSynthetic.i b/lldb/bindings/interface/SBTypeSynthetic.i similarity index 95% rename from lldb/scripts/interface/SBTypeSynthetic.i rename to lldb/bindings/interface/SBTypeSynthetic.i index d9d75e4c9efae..f57139ebf9f17 100644 --- a/lldb/scripts/interface/SBTypeSynthetic.i +++ b/lldb/bindings/interface/SBTypeSynthetic.i @@ -63,6 +63,8 @@ namespace lldb { bool operator != (lldb::SBTypeSynthetic &rhs); + STRING_EXTENSION_LEVEL(SBTypeSynthetic, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ options = property(GetOptions, SetOptions) diff --git a/lldb/scripts/interface/SBUnixSignals.i b/lldb/bindings/interface/SBUnixSignals.i similarity index 100% rename from lldb/scripts/interface/SBUnixSignals.i rename to lldb/bindings/interface/SBUnixSignals.i diff --git a/lldb/scripts/interface/SBValue.i b/lldb/bindings/interface/SBValue.i similarity index 99% rename from lldb/scripts/interface/SBValue.i rename to lldb/bindings/interface/SBValue.i index 8647854e89c15..fb899805c3951 100644 --- a/lldb/scripts/interface/SBValue.i +++ b/lldb/bindings/interface/SBValue.i @@ -440,6 +440,8 @@ public: const SBExpressionOptions &options, const char *name) const; + STRING_EXTENSION(SBValue) + #ifdef SWIGPYTHON %pythoncode %{ def __get_dynamic__ (self): diff --git a/lldb/scripts/interface/SBValueList.i b/lldb/bindings/interface/SBValueList.i similarity index 85% rename from lldb/scripts/interface/SBValueList.i rename to lldb/bindings/interface/SBValueList.i index 56ef19054e4c5..17ba2056f0c23 100644 --- a/lldb/scripts/interface/SBValueList.i +++ b/lldb/bindings/interface/SBValueList.i @@ -101,6 +101,29 @@ public: lldb::SBValue GetFirstValueByName (const char* name) const; + %extend { + %nothreadallow; + std::string lldb::SBValueList::__str__ (){ + lldb::SBStream description; + const size_t n = $self->GetSize(); + if (n) + { + for (size_t i=0; 
i<n; i++)
+              $self->GetValueAtIndex(i).GetDescription(description);
+      }
+      else
+      {
+          description.Printf("<empty> lldb.SBValueList()");
+      }
+      const char *desc = description.GetData();
+      size_t desc_len = description.GetSize();
+      if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r'))
+          --desc_len;
+      return std::string(desc, desc_len);
+   }
+   %clearnothreadallow;
+   }
+
 #ifdef SWIGPYTHON
 %pythoncode %{
     def __iter__(self):
diff --git a/lldb/scripts/interface/SBVariablesOptions.i b/lldb/bindings/interface/SBVariablesOptions.i
similarity index 100%
rename from lldb/scripts/interface/SBVariablesOptions.i
rename to lldb/bindings/interface/SBVariablesOptions.i
diff --git a/lldb/scripts/interface/SBWatchpoint.i b/lldb/bindings/interface/SBWatchpoint.i
similarity index 96%
rename from lldb/scripts/interface/SBWatchpoint.i
rename to lldb/bindings/interface/SBWatchpoint.i
index e11c4f213ca2e..cb0bc5f9859ac 100644
--- a/lldb/scripts/interface/SBWatchpoint.i
+++ b/lldb/bindings/interface/SBWatchpoint.i
@@ -90,6 +90,7 @@ public:
     static lldb::SBWatchpoint
     GetWatchpointFromEvent (const lldb::SBEvent& event);

+    STRING_EXTENSION_LEVEL(SBWatchpoint, lldb::eDescriptionLevelVerbose)
 };

 } // namespace lldb
diff --git a/lldb/scripts/interfaces.swig b/lldb/bindings/interfaces.swig
similarity index 99%
rename from lldb/scripts/interfaces.swig
rename to lldb/bindings/interfaces.swig
index cc6bb91febdac..780fe34392ff5 100644
--- a/lldb/scripts/interfaces.swig
+++ b/lldb/bindings/interfaces.swig
@@ -32,8 +32,8 @@
 %include "./interface/SBEvent.i"
 %include "./interface/SBExecutionContext.i"
 %include "./interface/SBExpressionOptions.i"
-%include "./interface/SBFileSpec.i"
 %include "./interface/SBFile.i"
+%include "./interface/SBFileSpec.i"
 %include "./interface/SBFileSpecList.i"
 %include "./interface/SBFrame.i"
 %include "./interface/SBFunction.i"
@@ -75,9 +75,8 @@
 %include "./interface/SBTypeNameSpecifier.i"
 %include "./interface/SBTypeSummary.i"
 %include "./interface/SBTypeSynthetic.i"
+%include "./interface/SBUnixSignals.i"
 %include "./interface/SBValue.i"
 %include "./interface/SBValueList.i"
 %include "./interface/SBVariablesOptions.i"
 %include "./interface/SBWatchpoint.i"
-%include "./interface/SBUnixSignals.i"
-
diff --git a/lldb/scripts/lldb_lua.swig b/lldb/bindings/lua.swig
similarity index 93%
rename from lldb/scripts/lldb_lua.swig
rename to lldb/bindings/lua.swig
index bf8809015d9a3..3b279a6b69e7f 100644
--- a/lldb/scripts/lldb_lua.swig
+++ b/lldb/bindings/lua.swig
@@ -9,6 +9,7 @@
 %module lldb

 %include <std_string.i>
+%include "./macros.swig"
 %include "./headers.swig"

 %{
diff --git a/lldb/bindings/macros.swig b/lldb/bindings/macros.swig
new file mode 100644
index 0000000000000..0387f27f3cb9b
--- /dev/null
+++ b/lldb/bindings/macros.swig
@@ -0,0 +1,33 @@
+%define STRING_EXTENSION_LEVEL(Class, Level)
+%extend {
+  %nothreadallow;
+  std::string lldb:: ## Class ## ::__str__(){
+    lldb::SBStream stream;
+    $self->GetDescription (stream, Level);
+    const char *desc = stream.GetData();
+    size_t desc_len = stream.GetSize();
+    if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) {
+      --desc_len;
+    }
+    return std::string(desc, desc_len);
+  }
+  %clearnothreadallow;
+}
+%enddef
+
+%define STRING_EXTENSION(Class)
+%extend {
+  %nothreadallow;
+  std::string lldb:: ## Class ## ::__str__(){
+    lldb::SBStream stream;
+    $self->GetDescription (stream);
+    const char *desc = stream.GetData();
+    size_t desc_len = stream.GetSize();
+    if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) {
+      --desc_len;
+    }
+    return 
std::string(desc, desc_len); + } + %clearnothreadallow; +} +%enddef diff --git a/lldb/scripts/lldb.swig b/lldb/bindings/python.swig similarity index 86% rename from lldb/scripts/lldb.swig rename to lldb/bindings/python.swig index c3b9083327410..56fab9ff17951 100644 --- a/lldb/scripts/lldb.swig +++ b/lldb/bindings/python.swig @@ -59,6 +59,23 @@ except ImportError: // Parameter types will be used in the autodoc string. %feature("autodoc", "1"); +%define ARRAYHELPER(type,name) +%inline %{ +type *new_ ## name (int nitems) { + return (type *) malloc(sizeof(type)*nitems); +} +void delete_ ## name(type *t) { + free(t); +} +type name ## _get(type *t, int index) { + return t[index]; +} +void name ## _set(type *t, int index, type val) { + t[index] = val; +} +%} +%enddef + %pythoncode%{ import uuid import re @@ -94,20 +111,21 @@ def lldb_iter(obj, getsize, getelem): %} %include -%include "./Python/python-typemaps.swig" +%include "./python/python-typemaps.swig" +%include "./macros.swig" %include "./headers.swig" %{ -#include "../source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h" -#include "../scripts/Python/python-swigsafecast.swig" +#include "../source/Plugins/ScriptInterpreter/python/PythonDataObjects.h" +#include "../bindings/python/python-swigsafecast.swig" using namespace lldb_private; using namespace lldb_private::python; using namespace lldb; %} %include "./interfaces.swig" -%include "./Python/python-extensions.swig" -%include "./Python/python-wrapper.swig" +%include "./python/python-extensions.swig" +%include "./python/python-wrapper.swig" %pythoncode%{ debugger_unique_id = 0 diff --git a/lldb/scripts/Python/createPythonInit.py b/lldb/bindings/python/createPythonInit.py similarity index 100% rename from lldb/scripts/Python/createPythonInit.py rename to lldb/bindings/python/createPythonInit.py diff --git a/lldb/bindings/python/python-extensions.swig b/lldb/bindings/python/python-extensions.swig new file mode 100644 index 0000000000000..0b23fdd400068 --- /dev/null +++ b/lldb/bindings/python/python-extensions.swig @@ -0,0 +1,592 @@ +%extend lldb::SBBreakpoint { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBBroadcaster { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBCommandReturnObject { + /* the write() and flush() calls are not part of the SB API proper, and are solely for Python usage + they are meant to make an SBCommandReturnObject into a file-like object so that instructions of the sort + print >>sb_command_return_object, "something" + will work correctly */ + + void lldb::SBCommandReturnObject::write (const char* str) + { + if (str) + $self->Printf("%s",str); + } + void lldb::SBCommandReturnObject::flush () + {} +} + +%extend lldb::SBCompileUnit { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return 
getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBDeclaration { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBFunction { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBLineEntry { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBModule { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBSection { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} +%extend lldb::SBStream { + /* the write() and flush() calls are not part of the SB API proper, and are solely for Python usage + they are meant to make an SBStream into a file-like object so that instructions of the sort + print >>sb_stream, "something" + will work correctly */ + + void lldb::SBStream::write (const char* str) + { + if (str) + $self->Printf("%s",str); + } + void lldb::SBStream::flush () + {} +} +%extend lldb::SBSymbol { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBTarget { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBTypeFilter { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBTypeNameSpecifier { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not 
isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBTypeSummary { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBTypeSynthetic { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%extend lldb::SBThread { + %pythoncode %{ + def __eq__(self, rhs): + if not isinstance(rhs, type(self)): + return False + + return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) + + def __ne__(self, rhs): + if not isinstance(rhs, type(self)): + return True + + return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) + %} +} + +%pythoncode %{ + +def command(command_name=None, doc=None): + import lldb + """A decorator function that registers an LLDB command line + command that is bound to the function it is attached to.""" + def callable(function): + """Registers an lldb command for the decorated function.""" + command = "command script add -f %s.%s %s" % (function.__module__, function.__name__, command_name or function.__name__) + lldb.debugger.HandleCommand(command) + if doc: + function.__doc__ = doc + return function + + return callable + +class declaration(object): + '''A class that represents a source declaration location with file, line and column.''' + def __init__(self, file, line, col): + self.file = file + self.line = line + self.col = col + +class value_iter(object): + def __iter__(self): + return self + + def __next__(self): + if self.index >= self.length: + raise StopIteration() + child_sbvalue = self.sbvalue.GetChildAtIndex(self.index) + self.index += 1 + return value(child_sbvalue) + + def next(self): + return self.__next__() + + def __init__(self,value): + self.index = 0 + self.sbvalue = value + if type(self.sbvalue) is value: + self.sbvalue = self.sbvalue.sbvalue + self.length = self.sbvalue.GetNumChildren() + +class value(object): + '''A class designed to wrap lldb.SBValue() objects so the resulting object + can be used as a variable would be in code. So if you have a Point structure + variable in your code in the current frame named "pt", you can initialize an instance + of this class with it: + + pt = lldb.value(lldb.frame.FindVariable("pt")) + print pt + print pt.x + print pt.y + + pt = lldb.value(lldb.frame.FindVariable("rectangle_array")) + print rectangle_array[12] + print rectangle_array[5].origin.x''' + def __init__(self, sbvalue): + self.sbvalue = sbvalue + + def __nonzero__(self): + return self.sbvalue.__nonzero__() + + def __bool__(self): + return self.sbvalue.__bool__() + + def __str__(self): + return self.sbvalue.__str__() + + def __getitem__(self, key): + # Allow array access if this value has children... 
+ +class declaration(object): + '''A class that represents a source declaration location with file, line and column.''' + def __init__(self, file, line, col): + self.file = file + self.line = line + self.col = col + +class value_iter(object): + def __iter__(self): + return self + + def __next__(self): + if self.index >= self.length: + raise StopIteration() + child_sbvalue = self.sbvalue.GetChildAtIndex(self.index) + self.index += 1 + return value(child_sbvalue) + + def next(self): + return self.__next__() + + def __init__(self,value): + self.index = 0 + self.sbvalue = value + if type(self.sbvalue) is value: + self.sbvalue = self.sbvalue.sbvalue + self.length = self.sbvalue.GetNumChildren() + +class value(object): + '''A class designed to wrap lldb.SBValue() objects so the resulting object + can be used as a variable would be in code. So if you have a Point structure + variable in your code in the current frame named "pt", you can initialize an instance + of this class with it: + + pt = lldb.value(lldb.frame.FindVariable("pt")) + print pt + print pt.x + print pt.y + + pt = lldb.value(lldb.frame.FindVariable("rectangle_array")) + print pt[12] + print pt[5].origin.x''' + def __init__(self, sbvalue): + self.sbvalue = sbvalue + + def __nonzero__(self): + return self.sbvalue.__nonzero__() + + def __bool__(self): + return self.sbvalue.__bool__() + + def __str__(self): + return self.sbvalue.__str__() + + def __getitem__(self, key): + # Allow array access if this value has children... + if type(key) is value: + key = int(key) + if type(key) is int: + child_sbvalue = (self.sbvalue.GetValueForExpressionPath("[%i]" % key)) + if child_sbvalue and child_sbvalue.IsValid(): + return value(child_sbvalue) + raise IndexError("Index '%d' is out of range" % key) + raise TypeError("No array item of type %s" % str(type(key))) + + def __iter__(self): + return value_iter(self.sbvalue) + + def __getattr__(self, name): + child_sbvalue = self.sbvalue.GetChildMemberWithName (name) + if child_sbvalue and child_sbvalue.IsValid(): + return value(child_sbvalue) + raise AttributeError("Attribute '%s' is not defined" % name) + + def __add__(self, other): + return int(self) + int(other) + + def __sub__(self, other): + return int(self) - int(other) + + def __mul__(self, other): + return int(self) * int(other) + + def __floordiv__(self, other): + return int(self) // int(other) + + def __mod__(self, other): + return int(self) % int(other) + + def __divmod__(self, other): + return divmod(int(self), int(other)) + + def __pow__(self, other): + return int(self) ** int(other) + + def __lshift__(self, other): + return int(self) << int(other) + + def __rshift__(self, other): + return int(self) >> int(other) + + def __and__(self, other): + return int(self) & int(other) + + def __xor__(self, other): + return int(self) ^ int(other) + + def __or__(self, other): + return int(self) | int(other) + + def __div__(self, other): + return int(self) / int(other) + + def __truediv__(self, other): + return int(self) / int(other) + + def __iadd__(self, other): + result = self.__add__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __isub__(self, other): + result = self.__sub__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __imul__(self, other): + result = self.__mul__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __idiv__(self, other): + result = self.__div__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __itruediv__(self, other): + result = self.__truediv__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __ifloordiv__(self, other): + result = self.__floordiv__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __imod__(self, other): + result = self.__mod__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __ipow__(self, other, modulo=None): + result = pow(int(self), int(other), modulo) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __ilshift__(self, other): + result = self.__lshift__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __irshift__(self, other): + result = self.__rshift__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __iand__(self, other): + result = self.__and__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __ixor__(self, other): + result = self.__xor__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __ior__(self, other): + result = self.__or__(other) + self.sbvalue.SetValueFromCString (str(result)) + return result + + def __neg__(self): + return -int(self) + + def __pos__(self): + return +int(self) + + def __abs__(self): + return abs(int(self)) + + def __invert__(self): + return
~int(self) + + def __complex__(self): + return complex (int(self)) + + def __int__(self): + is_num,is_sign = is_numeric_type(self.sbvalue.GetType().GetCanonicalType().GetBasicType()) + if is_num and not is_sign: return self.sbvalue.GetValueAsUnsigned() + return self.sbvalue.GetValueAsSigned() + + def __long__(self): + return self.__int__() + + def __float__(self): + return float (self.sbvalue.GetValueAsSigned()) + + def __oct__(self): + return '0%o' % self.sbvalue.GetValueAsUnsigned() + + def __hex__(self): + return '0x%x' % self.sbvalue.GetValueAsUnsigned() + + def __len__(self): + return self.sbvalue.GetNumChildren() + + def __eq__(self, other): + if type(other) is int: + return int(self) == other + elif type(other) is str: + return str(self) == other + elif type(other) is value: + self_err = SBError() + other_err = SBError() + self_val = self.sbvalue.GetValueAsUnsigned(self_err) + if self_err.fail: + raise ValueError("unable to extract value of self") + other_val = other.sbvalue.GetValueAsUnsigned(other_err) + if other_err.fail: + raise ValueError("unable to extract value of other") + return self_val == other_val + raise TypeError("Unknown type %s, No equality operation defined." % str(type(other))) + + def __ne__(self, other): + return not self.__eq__(other) +%}
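The wrapper above is what lets scripts treat debuggee variables like ordinary Python values. A short usage sketch, assuming an interactive session where lldb.frame is valid and the frame has variables named "count" and "items" (both names invented for illustration):

    import lldb

    count = lldb.value(lldb.frame.FindVariable("count"))
    if count == 0:              # __eq__ accepts int, str, or another value
        print("count is zero")
    print(int(count) + 1)       # arithmetic funnels through __int__

    items = lldb.value(lldb.frame.FindVariable("items"))
    for item in items:          # iteration goes through value_iter
        print(item)             # __str__ delegates to the underlying SBValue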
+ +%pythoncode %{ + +class SBSyntheticValueProvider(object): + def __init__(self,valobj): + pass + + def num_children(self): + return 0 + + def get_child_index(self,name): + return None + + def get_child_at_index(self,idx): + return None + + def update(self): + pass + + def has_children(self): + return False + + +%} + +%pythoncode %{ + +# given an lldb.SBBasicType it returns a tuple +# (is_numeric, is_signed) +# the value of is_signed is undefined if is_numeric == false +def is_numeric_type(basic_type): + if basic_type == eBasicTypeInvalid: return (False,False) + if basic_type == eBasicTypeVoid: return (False,False) + if basic_type == eBasicTypeChar: return (True,False) + if basic_type == eBasicTypeSignedChar: return (True,True) + if basic_type == eBasicTypeUnsignedChar: return (True,False) + if basic_type == eBasicTypeWChar: return (True,False) + if basic_type == eBasicTypeSignedWChar: return (True,True) + if basic_type == eBasicTypeUnsignedWChar: return (True,False) + if basic_type == eBasicTypeChar16: return (True,False) + if basic_type == eBasicTypeChar32: return (True,False) + if basic_type == eBasicTypeShort: return (True,True) + if basic_type == eBasicTypeUnsignedShort: return (True,False) + if basic_type == eBasicTypeInt: return (True,True) + if basic_type == eBasicTypeUnsignedInt: return (True,False) + if basic_type == eBasicTypeLong: return (True,True) + if basic_type == eBasicTypeUnsignedLong: return (True,False) + if basic_type == eBasicTypeLongLong: return (True,True) + if basic_type == eBasicTypeUnsignedLongLong: return (True,False) + if basic_type == eBasicTypeInt128: return (True,True) + if basic_type == eBasicTypeUnsignedInt128: return (True,False) + if basic_type == eBasicTypeBool: return (False,False) + if basic_type == eBasicTypeHalf: return (True,True) + if basic_type == eBasicTypeFloat: return (True,True) + if basic_type == eBasicTypeDouble: return (True,True) + if basic_type == eBasicTypeLongDouble: return (True,True) + if basic_type == eBasicTypeFloatComplex: return (True,True) + if basic_type == eBasicTypeDoubleComplex: return (True,True) + if basic_type == eBasicTypeLongDoubleComplex: return (True,True) + if basic_type == eBasicTypeObjCID: return (False,False) + if basic_type == eBasicTypeObjCClass: return (False,False) + if basic_type == eBasicTypeObjCSel: return (False,False) + if basic_type == eBasicTypeNullPtr: return (False,False) + #if basic_type == eBasicTypeOther: + return (False,False) + +%}
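is_numeric_type() is the helper value.__int__ consults to decide between signed and unsigned extraction. A hedged sketch, assuming a live session and a frame variable named "v" (invented for illustration):

    import lldb

    v = lldb.frame.FindVariable("v")
    basic = v.GetType().GetCanonicalType().GetBasicType()
    is_num, is_signed = lldb.is_numeric_type(basic)
    if is_num:
        print(v.GetValueAsSigned() if is_signed else v.GetValueAsUnsigned())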
diff --git a/lldb/scripts/Python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig similarity index 100% rename from lldb/scripts/Python/python-swigsafecast.swig rename to lldb/bindings/python/python-swigsafecast.swig diff --git a/lldb/scripts/Python/python-typemaps.swig b/lldb/bindings/python/python-typemaps.swig similarity index 100% rename from lldb/scripts/Python/python-typemaps.swig rename to lldb/bindings/python/python-typemaps.swig diff --git a/lldb/scripts/Python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig similarity index 100% rename from lldb/scripts/Python/python-wrapper.swig rename to lldb/bindings/python/python-wrapper.swig diff --git a/lldb/docs/CMakeLists.txt b/lldb/docs/CMakeLists.txt index 0082d004bd0d6..8fa46860e5cec 100644 --- a/lldb/docs/CMakeLists.txt +++ b/lldb/docs/CMakeLists.txt @@ -30,9 +30,9 @@ if (LLDB_ENABLE_PYTHON) # Because we don't build liblldb, epydoc will complain that the import of # _lldb.so failed, but that doesn't prevent it from generating the docs. file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lldb) - get_target_property(lldb_scripts_dir swig_wrapper BINARY_DIR) + get_target_property(lldb_bindings_dir swig_wrapper BINARY_DIR) add_custom_target(lldb-python-doc-package - COMMAND "${CMAKE_COMMAND}" -E copy "${lldb_scripts_dir}/lldb.py" "${CMAKE_CURRENT_BINARY_DIR}/lldb/__init__.py" + COMMAND "${CMAKE_COMMAND}" -E copy "${lldb_bindings_dir}/lldb.py" "${CMAKE_CURRENT_BINARY_DIR}/lldb/__init__.py" COMMENT "Copying lldb.py to pretend package.") add_dependencies(lldb-python-doc-package swig_wrapper) diff --git a/lldb/include/lldb/Expression/DWARFExpression.h b/lldb/include/lldb/Expression/DWARFExpression.h index 1e32957443fd3..bfae142d5e016 100644 --- a/lldb/include/lldb/Expression/DWARFExpression.h +++ b/lldb/include/lldb/Expression/DWARFExpression.h @@ -34,15 +34,6 @@ namespace lldb_private { /// location expression or a location list and interprets it. class DWARFExpression { public: - enum LocationListFormat : uint8_t { - NonLocationList, // Not a location list - RegularLocationList, // Location list format used in non-split dwarf files - SplitDwarfLocationList, // Location list format used in pre-DWARF v5 split - // dwarf files (.debug_loc.dwo) - LocLists, // Location list format used in DWARF v5 - // (.debug_loclists/.debug_loclists.dwo). - }; - DWARFExpression(); /// Constructor diff --git a/lldb/include/lldb/Interpreter/OptionValue.h b/lldb/include/lldb/Interpreter/OptionValue.h index 734c92b4bcada..44c7f621a5824 100644 --- a/lldb/include/lldb/Interpreter/OptionValue.h +++ b/lldb/include/lldb/Interpreter/OptionValue.h @@ -58,8 +58,7 @@ class OptionValue { eDumpGroupExport = (eDumpOptionCommand | eDumpOptionName | eDumpOptionValue) }; - OptionValue() - : m_callback(nullptr), m_baton(nullptr), m_value_was_set(false) {} + OptionValue() : m_value_was_set(false) {} virtual ~OptionValue() = default; @@ -304,22 +303,19 @@ class OptionValue { m_parent_wp = parent_sp; } - void SetValueChangedCallback(OptionValueChangedCallback callback, - void *baton) { - assert(m_callback == nullptr); - m_callback = callback; - m_baton = baton; + void SetValueChangedCallback(std::function<void()> callback) { + assert(!m_callback); + m_callback = std::move(callback); } void NotifyValueChanged() { if (m_callback) - m_callback(m_baton, this); + m_callback(); } protected: lldb::OptionValueWP m_parent_wp; - OptionValueChangedCallback m_callback; - void *m_baton; + std::function<void()> m_callback; bool m_value_was_set; // This can be used to see if a value has been set // by a call to SetValueFromCString(). It is often // handy to know if an option value was set from the diff --git a/lldb/include/lldb/Interpreter/OptionValueProperties.h b/lldb/include/lldb/Interpreter/OptionValueProperties.h index bea2b3c91e009..980f01183ef56 100644 --- a/lldb/include/lldb/Interpreter/OptionValueProperties.h +++ b/lldb/include/lldb/Interpreter/OptionValueProperties.h @@ -198,8 +198,7 @@ ConstString name); void SetValueChangedCallback(uint32_t property_idx, - OptionValueChangedCallback callback, - void *baton); + std::function<void()> callback); protected: Property *ProtectedGetPropertyAtIndex(uint32_t idx) { diff --git a/lldb/include/lldb/Interpreter/Property.h b/lldb/include/lldb/Interpreter/Property.h index 797aee4be8159..76264832705ba 100644 --- a/lldb/include/lldb/Interpreter/Property.h +++ b/lldb/include/lldb/Interpreter/Property.h @@ -64,8 +64,7 @@ class Property { uint32_t output_width, bool display_qualified_name) const; - void SetValueChangedCallback(OptionValueChangedCallback callback, - void *baton); + void SetValueChangedCallback(std::function<void()> callback); protected: ConstString m_name; diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index 47c5c78704052..2ba996d4995f5 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -85,9 +85,6 @@ class ProcessProperties : public Properties { std::chrono::seconds GetUtilityExpressionTimeout() const; protected: - static void OptionValueChangedCallback(void *baton, - OptionValue *option_value); - Process *m_process; // Can be nullptr for global ProcessProperties }; diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index 6f8d60731acf5..1e9153c401ef1 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -209,26 +209,15 @@ class TargetProperties : public Properties { private: // Callbacks for m_launch_info.
- static void Arg0ValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void RunArgsValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void EnvVarsValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void InheritEnvValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void InputPathValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void OutputPathValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void ErrorPathValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void DetachOnErrorValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void DisableASLRValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void DisableSTDIOValueChangedCallback(void *target_property_ptr, - OptionValue *); + void Arg0ValueChangedCallback(); + void RunArgsValueChangedCallback(); + void EnvVarsValueChangedCallback(); + void InputPathValueChangedCallback(); + void OutputPathValueChangedCallback(); + void ErrorPathValueChangedCallback(); + void DetachOnErrorValueChangedCallback(); + void DisableASLRValueChangedCallback(); + void DisableSTDIOValueChangedCallback(); // Member variables. ProcessLaunchInfo m_launch_info; diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h index 04b78bcc19f8e..27a2c4c3f27ff 100644 --- a/lldb/include/lldb/lldb-private-interfaces.h +++ b/lldb/include/lldb/lldb-private-interfaces.h @@ -82,8 +82,6 @@ typedef bool (*BreakpointHitCallback)(void *baton, typedef bool (*WatchpointHitCallback)(void *baton, StoppointCallbackContext *context, lldb::user_id_t watch_id); -typedef void (*OptionValueChangedCallback)(void *baton, - OptionValue *option_value); typedef bool (*ThreadPlanShouldStopHereCallback)( ThreadPlan *current_plan, Flags &flags, lldb::FrameComparison operation, Status &status, void *baton); diff --git a/lldb/packages/Python/lldbsuite/test/commands/target/create-no-such-arch/TestNoSuchArch.py b/lldb/packages/Python/lldbsuite/test/commands/target/create-no-such-arch/TestNoSuchArch.py index a780ca2756647..4d7f0838f8776 100644 --- a/lldb/packages/Python/lldbsuite/test/commands/target/create-no-such-arch/TestNoSuchArch.py +++ b/lldb/packages/Python/lldbsuite/test/commands/target/create-no-such-arch/TestNoSuchArch.py @@ -19,8 +19,8 @@ def test(self): # Check that passing an invalid arch via the command-line fails but # doesn't crash self.expect( - "target crete --arch nothingtoseehere %s" % - (exe), error=True) + "target create --arch nothingtoseehere %s" % + (exe), error=True, substrs=["error: invalid triple 'nothingtoseehere'"]) # Check that passing an invalid arch via the SB API fails but doesn't # crash diff --git a/lldb/scripts/Python/python-extensions.swig b/lldb/scripts/Python/python-extensions.swig deleted file mode 100644 index dbd4b1d79d005..0000000000000 --- a/lldb/scripts/Python/python-extensions.swig +++ /dev/null @@ -1,1109 +0,0 @@ -%extend lldb::SBAddress { - %nothreadallow; - PyObject *lldb::SBAddress::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBBlock { - %nothreadallow; - PyObject *lldb::SBBlock::__str__ 
(){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBBreakpoint { - %nothreadallow; - PyObject *lldb::SBBreakpoint::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} - -} -%extend lldb::SBBreakpointLocation { - %nothreadallow; - PyObject *lldb::SBBreakpointLocation::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelFull); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - -%extend lldb::SBBreakpointName { - %nothreadallow; - PyObject *lldb::SBBreakpointName::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - -%extend lldb::SBBroadcaster { - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} - -%extend lldb::SBCommandReturnObject { - %nothreadallow; - PyObject *lldb::SBCommandReturnObject::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - - /* the write() and flush() calls are not part of the SB API proper, and are solely for Python usage - they are meant to make an SBCommandReturnObject into a file-like object so that instructions of the sort - print >>sb_command_return_object, "something" - will work correctly */ - - void lldb::SBCommandReturnObject::write (const char* str) - { - if (str) - $self->Printf("%s",str); - } - void lldb::SBCommandReturnObject::flush () - {} -} -%extend lldb::SBCompileUnit { - %nothreadallow; - PyObject *lldb::SBCompileUnit::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if 
(desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} -%extend lldb::SBData { - %nothreadallow; - PyObject *lldb::SBData::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBDebugger { - %nothreadallow; - PyObject *lldb::SBDebugger::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBDeclaration { - %nothreadallow; - PyObject *lldb::SBDeclaration::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} - -} -%extend lldb::SBError { - %nothreadallow; - PyObject *lldb::SBError::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBFileSpec { - %nothreadallow; - PyObject *lldb::SBFileSpec::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBFrame { - %nothreadallow; - PyObject *lldb::SBFrame::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBFunction { - %nothreadallow; - PyObject *lldb::SBFunction::__str__ (){ - lldb::SBStream description; - $self->GetDescription 
(description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} - -} -%extend lldb::SBInstruction { - %nothreadallow; - PyObject *lldb::SBInstruction::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBInstructionList { - %nothreadallow; - PyObject *lldb::SBInstructionList::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBLineEntry { - %nothreadallow; - PyObject *lldb::SBLineEntry::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} - -%extend lldb::SBMemoryRegionInfo { - %nothreadallow; - PyObject *lldb::SBMemoryRegionInfo::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - -%extend lldb::SBModule { - %nothreadallow; - PyObject *lldb::SBModule::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} - -%extend lldb::SBModuleSpec { - %nothreadallow; - PyObject *lldb::SBModuleSpec::__str__ (){ - 
lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - -%extend lldb::SBModuleSpecList { - %nothreadallow; - PyObject *lldb::SBModuleSpecList::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - -%extend lldb::SBProcess { - %nothreadallow; - PyObject *lldb::SBProcess::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBSection { - %nothreadallow; - PyObject *lldb::SBSection::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} -%extend lldb::SBStream { - /* the write() and flush() calls are not part of the SB API proper, and are solely for Python usage - they are meant to make an SBStream into a file-like object so that instructions of the sort - print >>sb_stream, "something" - will work correctly */ - - void lldb::SBStream::write (const char* str) - { - if (str) - $self->Printf("%s",str); - } - void lldb::SBStream::flush () - {} -} -%extend lldb::SBSymbol { - %nothreadallow; - PyObject *lldb::SBSymbol::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} -%extend lldb::SBSymbolContext { - %nothreadallow; - PyObject *lldb::SBSymbolContext::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - 
%clearnothreadallow; -} -%extend lldb::SBSymbolContextList { - %nothreadallow; - PyObject *lldb::SBSymbolContextList::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - -%extend lldb::SBTarget { - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} - -%extend lldb::SBType { - %nothreadallow; - PyObject *lldb::SBType::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBTypeCategory { - %nothreadallow; - PyObject *lldb::SBTypeCategory::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBTypeFilter { - %nothreadallow; - PyObject *lldb::SBTypeFilter::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} -%extend lldb::SBTypeFormat { - %nothreadallow; - PyObject *lldb::SBTypeFormat::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBTypeMember { - %nothreadallow; - PyObject *lldb::SBTypeMember::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBTypeMemberFunction { - %nothreadallow; - PyObject 
*lldb::SBTypeMemberFunction::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBTypeEnumMember { - %nothreadallow; - PyObject *lldb::SBTypeEnumMember::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBTypeNameSpecifier { - %nothreadallow; - PyObject *lldb::SBTypeNameSpecifier::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} -%extend lldb::SBTypeSummary { - %nothreadallow; - PyObject *lldb::SBTypeSummary::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} -%extend lldb::SBTypeSynthetic { - %nothreadallow; - PyObject *lldb::SBTypeSynthetic::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} -%extend lldb::SBThread { - %nothreadallow; - PyObject *lldb::SBThread::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == 
'\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - %pythoncode %{ - def __eq__(self, rhs): - if not isinstance(rhs, type(self)): - return False - - return getattr(_lldb,self.__class__.__name__+"___eq__")(self, rhs) - - def __ne__(self, rhs): - if not isinstance(rhs, type(self)): - return True - - return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) - %} -} -%extend lldb::SBValue { - %nothreadallow; - PyObject *lldb::SBValue::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBValueList { - %nothreadallow; - PyObject *lldb::SBValueList::__str__ (){ - lldb::SBStream description; - const size_t n = $self->GetSize(); - if (n) - { - for (size_t i=0; i<n; i++) - $self->GetValueAtIndex(i).GetDescription(description); - } - else - { - description.Printf("<empty> lldb.SBValueList()"); - } - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBWatchpoint { - %nothreadallow; - PyObject *lldb::SBWatchpoint::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelVerbose); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - - -// %extend lldb::SBDebugger { -// // FIXME: We can't get the callback and baton -// PyObject *lldb::SBDebugger (){ -// // Only call Py_XDECREF if we have a Python object (or NULL) -// if (LLDBSwigPythonCallPythonLogOutputCallback == $self->GetLogOutPutCallback()) -// Py_XDECREF($self->GetCallbackBaton()); -// } -// } - -%pythoncode %{ - -def command(command_name=None, doc=None): - import lldb - """A decorator function that registers an LLDB command line - command that is bound to the function it is attached to.""" - def callable(function): - """Registers an lldb command for the decorated function.""" - command = "command script add -f %s.%s %s" % (function.__module__, function.__name__, command_name or function.__name__) - lldb.debugger.HandleCommand(command) - if doc: - function.__doc__ = doc - return function - - return callable - -class declaration(object): - '''A class that represents a source declaration location with file, line and column.''' - def __init__(self, file, line, col): - self.file = file - self.line = line - self.col = col - -class value_iter(object): - def __iter__(self): - return self - - def __next__(self): - if self.index >= self.length: - raise StopIteration() - child_sbvalue = self.sbvalue.GetChildAtIndex(self.index) - self.index += 1 - return value(child_sbvalue) - - def next(self): - return self.__next__() - - def __init__(self,value): - self.index = 0 - self.sbvalue = value - if type(self.sbvalue) is value: - self.sbvalue = self.sbvalue.sbvalue - self.length = self.sbvalue.GetNumChildren() - -class value(object): - '''A class designed to wrap lldb.SBValue()
objects so the resulting object - can be used as a variable would be in code. So if you have a Point structure - variable in your code in the current frame named "pt", you can initialize an instance - of this class with it: - - pt = lldb.value(lldb.frame.FindVariable("pt")) - print pt - print pt.x - print pt.y - - pt = lldb.value(lldb.frame.FindVariable("rectangle_array")) - print rectangle_array[12] - print rectangle_array[5].origin.x''' - def __init__(self, sbvalue): - self.sbvalue = sbvalue - - def __nonzero__(self): - return self.sbvalue.__nonzero__() - - def __bool__(self): - return self.sbvalue.__bool__() - - def __str__(self): - return self.sbvalue.__str__() - - def __getitem__(self, key): - # Allow array access if this value has children... - if type(key) is value: - key = int(key) - if type(key) is int: - child_sbvalue = (self.sbvalue.GetValueForExpressionPath("[%i]" % key)) - if child_sbvalue and child_sbvalue.IsValid(): - return value(child_sbvalue) - raise IndexError("Index '%d' is out of range" % key) - raise TypeError("No array item of type %s" % str(type(key))) - - def __iter__(self): - return value_iter(self.sbvalue) - - def __getattr__(self, name): - child_sbvalue = self.sbvalue.GetChildMemberWithName (name) - if child_sbvalue and child_sbvalue.IsValid(): - return value(child_sbvalue) - raise AttributeError("Attribute '%s' is not defined" % name) - - def __add__(self, other): - return int(self) + int(other) - - def __sub__(self, other): - return int(self) - int(other) - - def __mul__(self, other): - return int(self) * int(other) - - def __floordiv__(self, other): - return int(self) // int(other) - - def __mod__(self, other): - return int(self) % int(other) - - def __divmod__(self, other): - return int(self) % int(other) - - def __pow__(self, other): - return int(self) ** int(other) - - def __lshift__(self, other): - return int(self) << int(other) - - def __rshift__(self, other): - return int(self) >> int(other) - - def __and__(self, other): - return int(self) & int(other) - - def __xor__(self, other): - return int(self) ^ int(other) - - def __or__(self, other): - return int(self) | int(other) - - def __div__(self, other): - return int(self) / int(other) - - def __truediv__(self, other): - return int(self) / int(other) - - def __iadd__(self, other): - result = self.__add__(other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __isub__(self, other): - result = self.__sub__(other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __imul__(self, other): - result = self.__mul__(other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __idiv__(self, other): - result = self.__div__(other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __itruediv__(self, other): - result = self.__truediv__(other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __ifloordiv__(self, other): - result = self.__floordiv__(self, other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __imod__(self, other): - result = self.__and__(self, other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __ipow__(self, other): - result = self.__pow__(self, other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __ipow__(self, other, modulo): - result = self.__pow__(self, other, modulo) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __ilshift__(self, other): - result = self.__lshift__(other) - 
self.sbvalue.SetValueFromCString (str(result)) - return result - - def __irshift__(self, other): - result = self.__rshift__(other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __iand__(self, other): - result = self.__and__(self, other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __ixor__(self, other): - result = self.__xor__(self, other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __ior__(self, other): - result = self.__ior__(self, other) - self.sbvalue.SetValueFromCString (str(result)) - return result - - def __neg__(self): - return -int(self) - - def __pos__(self): - return +int(self) - - def __abs__(self): - return abs(int(self)) - - def __invert__(self): - return ~int(self) - - def __complex__(self): - return complex (int(self)) - - def __int__(self): - is_num,is_sign = is_numeric_type(self.sbvalue.GetType().GetCanonicalType().GetBasicType()) - if is_num and not is_sign: return self.sbvalue.GetValueAsUnsigned() - return self.sbvalue.GetValueAsSigned() - - def __long__(self): - return self.__int__() - - def __float__(self): - return float (self.sbvalue.GetValueAsSigned()) - - def __oct__(self): - return '0%o' % self.sbvalue.GetValueAsUnsigned() - - def __hex__(self): - return '0x%x' % self.sbvalue.GetValueAsUnsigned() - - def __len__(self): - return self.sbvalue.GetNumChildren() - - def __eq__(self, other): - if type(other) is int: - return int(self) == other - elif type(other) is str: - return str(self) == other - elif type(other) is value: - self_err = SBError() - other_err = SBError() - self_val = self.sbvalue.GetValueAsUnsigned(self_err) - if self_err.fail: - raise ValueError("unable to extract value of self") - other_val = other.sbvalue.GetValueAsUnsigned(other_err) - if other_err.fail: - raise ValueError("unable to extract value of other") - return self_val == other_val - raise TypeError("Unknown type %s, No equality operation defined." 
% str(type(other))) - - def __ne__(self, other): - return not self.__eq__(other) -%} - -%pythoncode %{ - -class SBSyntheticValueProvider(object): - def __init__(self,valobj): - pass - - def num_children(self): - return 0 - - def get_child_index(self,name): - return None - - def get_child_at_index(self,idx): - return None - - def update(self): - pass - - def has_children(self): - return False - - -%} - -%pythoncode %{ - -# given an lldb.SBBasicType it returns a tuple -# (is_numeric, is_signed) -# the value of is_signed is undefined if is_numeric == false -def is_numeric_type(basic_type): - if basic_type == eBasicTypeInvalid: return (False,False) - if basic_type == eBasicTypeVoid: return (False,False) - if basic_type == eBasicTypeChar: return (True,False) - if basic_type == eBasicTypeSignedChar: return (True,True) - if basic_type == eBasicTypeUnsignedChar: return (True,False) - if basic_type == eBasicTypeWChar: return (True,False) - if basic_type == eBasicTypeSignedWChar: return (True,True) - if basic_type == eBasicTypeUnsignedWChar: return (True,False) - if basic_type == eBasicTypeChar16: return (True,False) - if basic_type == eBasicTypeChar32: return (True,False) - if basic_type == eBasicTypeShort: return (True,True) - if basic_type == eBasicTypeUnsignedShort: return (True,False) - if basic_type == eBasicTypeInt: return (True,True) - if basic_type == eBasicTypeUnsignedInt: return (True,False) - if basic_type == eBasicTypeLong: return (True,True) - if basic_type == eBasicTypeUnsignedLong: return (True,False) - if basic_type == eBasicTypeLongLong: return (True,True) - if basic_type == eBasicTypeUnsignedLongLong: return (True,False) - if basic_type == eBasicTypeInt128: return (True,True) - if basic_type == eBasicTypeUnsignedInt128: return (True,False) - if basic_type == eBasicTypeBool: return (False,False) - if basic_type == eBasicTypeHalf: return (True,True) - if basic_type == eBasicTypeFloat: return (True,True) - if basic_type == eBasicTypeDouble: return (True,True) - if basic_type == eBasicTypeLongDouble: return (True,True) - if basic_type == eBasicTypeFloatComplex: return (True,True) - if basic_type == eBasicTypeDoubleComplex: return (True,True) - if basic_type == eBasicTypeLongDoubleComplex: return (True,True) - if basic_type == eBasicTypeObjCID: return (False,False) - if basic_type == eBasicTypeObjCClass: return (False,False) - if basic_type == eBasicTypeObjCSel: return (False,False) - if basic_type == eBasicTypeNullPtr: return (False,False) - #if basic_type == eBasicTypeOther: - return (False,False) - -%} diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index eea409bed185a..e0ecf29b502b7 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -5,13 +5,13 @@ endif() get_property(LLDB_ALL_PLUGINS GLOBAL PROPERTY LLDB_PLUGINS) if(LLDB_ENABLE_PYTHON) - get_target_property(lldb_scripts_dir swig_wrapper BINARY_DIR) - set(lldb_python_wrapper ${lldb_scripts_dir}/LLDBWrapPython.cpp) + get_target_property(lldb_bindings_dir swig_wrapper BINARY_DIR) + set(lldb_python_wrapper ${lldb_bindings_dir}/LLDBWrapPython.cpp) endif() if(LLDB_ENABLE_LUA) - get_target_property(lldb_scripts_dir swig_wrapper_lua BINARY_DIR) - set(lldb_lua_wrapper ${lldb_scripts_dir}/LLDBWrapLua.cpp) + get_target_property(lldb_bindings_dir swig_wrapper_lua BINARY_DIR) + set(lldb_lua_wrapper ${lldb_bindings_dir}/LLDBWrapLua.cpp) endif() if(LLDB_BUILD_FRAMEWORK) diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp index 4a9acab2e27c1..b1d7eee108b7d 100644 
--- a/lldb/source/Core/Section.cpp +++ b/lldb/source/Core/Section.cpp @@ -280,29 +280,6 @@ bool Section::ContainsFileAddress(addr_t vm_addr) const { return false; } -int Section::Compare(const Section &a, const Section &b) { - if (&a == &b) - return 0; - - const ModuleSP a_module_sp = a.GetModule(); - const ModuleSP b_module_sp = b.GetModule(); - if (a_module_sp == b_module_sp) { - user_id_t a_sect_uid = a.GetID(); - user_id_t b_sect_uid = b.GetID(); - if (a_sect_uid < b_sect_uid) - return -1; - if (a_sect_uid > b_sect_uid) - return 1; - return 0; - } else { - // The modules are different, just compare the module pointers - if (a_module_sp.get() < b_module_sp.get()) - return -1; - else - return 1; // We already know the modules aren't equal - } -} - void Section::Dump(Stream *s, Target *target, uint32_t depth) const { // s->Printf("%.*p: ", (int)sizeof(void*) * 2, this); s->Indent(); diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index c67e35b145189..69c84640ef93a 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -54,29 +54,6 @@ ReadAddressFromDebugAddrSection(const DWARFUnit *dwarf_cu, return LLDB_INVALID_ADDRESS; } -/// Return the location list parser for the given format. -static std::unique_ptr<llvm::DWARFLocationTable> -GetLocationTable(DWARFExpression::LocationListFormat format, const DataExtractor &data) { - llvm::DWARFDataExtractor llvm_data( - toStringRef(data.GetData()), - data.GetByteOrder() == lldb::eByteOrderLittle, data.GetAddressByteSize()); - - switch (format) { - case DWARFExpression::NonLocationList: - return nullptr; - // DWARF<=4 .debug_loc - case DWARFExpression::RegularLocationList: - return std::make_unique<llvm::DWARFDebugLoc>(llvm_data); - // Non-standard DWARF 4 extension (fission) .debug_loc.dwo - case DWARFExpression::SplitDwarfLocationList: - // DWARF 5 .debug_loclists(.dwo) - case DWARFExpression::LocLists: - return std::make_unique<llvm::DWARFDebugLoclists>( - llvm_data, format == DWARFExpression::LocLists ? 5 : 4); - } - llvm_unreachable("Invalid LocationListFormat!"); -} - // DWARFExpression constructor DWARFExpression::DWARFExpression() : m_module_wp(), m_data(), m_dwarf_cu(nullptr), @@ -157,10 +134,8 @@ void DWARFExpression::GetDescription(Stream *s, lldb::DescriptionLevel level, if (IsLocationList()) { // We have a location list lldb::offset_t offset = 0; - std::unique_ptr<llvm::DWARFLocationTable> loctable_up = GetLocationTable( - m_dwarf_cu->GetSymbolFileDWARF().GetLocationListFormat(), m_data); - if (!loctable_up) - return; + std::unique_ptr<llvm::DWARFLocationTable> loctable_up = + m_dwarf_cu->GetLocationTable(m_data); llvm::MCRegisterInfo *MRI = abi ?
&abi->GetMCRegisterInfo() : nullptr; @@ -2812,10 +2787,8 @@ DWARFExpression::GetLocationExpression(addr_t load_function_start, addr_t addr) const { Log *log = GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS); - std::unique_ptr<llvm::DWARFLocationTable> loctable_up = GetLocationTable( - m_dwarf_cu->GetSymbolFileDWARF().GetLocationListFormat(), m_data); - if (!loctable_up) - return llvm::None; + std::unique_ptr<llvm::DWARFLocationTable> loctable_up = + m_dwarf_cu->GetLocationTable(m_data); llvm::Optional<llvm::DWARFLocationExpression> result; uint64_t offset = 0; auto lookup_addr = diff --git a/lldb/source/Interpreter/OptionValueProperties.cpp b/lldb/source/Interpreter/OptionValueProperties.cpp index 4dae930c3a6f4..21750cf186156 100644 --- a/lldb/source/Interpreter/OptionValueProperties.cpp +++ b/lldb/source/Interpreter/OptionValueProperties.cpp @@ -60,10 +60,10 @@ void OptionValueProperties::Initialize(const PropertyDefinitions &defs) { } void OptionValueProperties::SetValueChangedCallback( - uint32_t property_idx, OptionValueChangedCallback callback, void *baton) { + uint32_t property_idx, std::function<void()> callback) { Property *property = ProtectedGetPropertyAtIndex(property_idx); if (property) - property->SetValueChangedCallback(callback, baton); + property->SetValueChangedCallback(std::move(callback)); } void OptionValueProperties::AppendProperty(ConstString name, diff --git a/lldb/source/Interpreter/Property.cpp b/lldb/source/Interpreter/Property.cpp index 78209311e2e51..a81098373c257 100644 --- a/lldb/source/Interpreter/Property.cpp +++ b/lldb/source/Interpreter/Property.cpp @@ -292,8 +292,7 @@ void Property::DumpDescription(CommandInterpreter &interpreter, Stream &strm, } } -void Property::SetValueChangedCallback(OptionValueChangedCallback callback, - void *baton) { +void Property::SetValueChangedCallback(std::function<void()> callback) { if (m_value_sp) - m_value_sp->SetValueChangedCallback(callback, baton); + m_value_sp->SetValueChangedCallback(std::move(callback)); } diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.cpp index dc64139fa4e5c..1dd0a9eade0c8 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.cpp @@ -10,9 +10,9 @@ #include "llvm/Support/FormatVariadic.h" using namespace lldb_private; +using namespace lldb; llvm::Error Lua::Run(llvm::StringRef buffer) { - std::lock_guard<std::mutex> lock(m_mutex); int error = luaL_loadbuffer(m_lua_state, buffer.data(), buffer.size(), "buffer") || lua_pcall(m_lua_state, 0, 0, 0); diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.h b/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.h index ed1d159590ac5..adc6c61184367 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.h +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.h @@ -9,6 +9,7 @@ #ifndef liblldb_Lua_h_ #define liblldb_Lua_h_ +#include "lldb/lldb-types.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Error.h" @@ -38,7 +39,6 @@ class Lua { llvm::Error Run(llvm::StringRef buffer); private: - std::mutex m_mutex; lua_State *m_lua_state; }; diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp index d5423b78b8c43..e46851c450920 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp @@ -27,7 +27,13 @@ class IOHandlerLuaInterpreter : public IOHandlerDelegate, : IOHandlerEditline(debugger, IOHandler::Type::LuaInterpreter, "lua", ">>> ", "..> ", true,
debugger.GetUseColor(), 0, *this, nullptr), - m_script_interpreter(script_interpreter) {} + m_script_interpreter(script_interpreter) { + llvm::cantFail(m_script_interpreter.EnterSession(debugger.GetID())); + } + + ~IOHandlerLuaInterpreter() { + llvm::cantFail(m_script_interpreter.LeaveSession()); + } void IOHandlerInputComplete(IOHandler &io_handler, std::string &data) override { @@ -89,6 +95,33 @@ void ScriptInterpreterLua::Initialize() { void ScriptInterpreterLua::Terminate() {} +llvm::Error ScriptInterpreterLua::EnterSession(user_id_t debugger_id) { + if (m_session_is_active) + return llvm::Error::success(); + + m_session_is_active = true; + + const char *fmt_str = + "lldb.debugger = lldb.SBDebugger.FindDebuggerWithID({0}); " + "lldb.target = lldb.debugger:GetSelectedTarget(); " + "lldb.process = lldb.target:GetProcess(); " + "lldb.thread = lldb.process:GetSelectedThread(); " + "lldb.frame = lldb.thread:GetSelectedFrame()"; + return m_lua->Run(llvm::formatv(fmt_str, debugger_id).str()); +} + +llvm::Error ScriptInterpreterLua::LeaveSession() { + if (!m_session_is_active) + return llvm::Error::success(); + + m_session_is_active = false; + + llvm::StringRef str = "lldb.debugger = nil; " + "lldb.target = nil; " + "lldb.process = nil; " + "lldb.thread = nil; " + "lldb.frame = nil"; + return m_lua->Run(str); +} + lldb::ScriptInterpreterSP ScriptInterpreterLua::CreateInstance(Debugger &debugger) { return std::make_shared<ScriptInterpreterLua>(debugger); diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h index b34c7d0e82176..550e1035567ca 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h @@ -43,8 +43,12 @@ class ScriptInterpreterLua : public ScriptInterpreter { Lua &GetLua(); + llvm::Error EnterSession(lldb::user_id_t debugger_id); + llvm::Error LeaveSession(); + private: std::unique_ptr<Lua> m_lua; + bool m_session_is_active = false; }; } // namespace lldb_private
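EnterSession seeds the Lua globals with the same convenience objects the Python interpreter exposes. For comparison, a sketch of what the formatted string above computes, written against the Python SB API (the debugger id 1 is a placeholder):

    import lldb

    dbg = lldb.SBDebugger.FindDebuggerWithID(1)  # placeholder id
    target = dbg.GetSelectedTarget()
    process = target.GetProcess()
    thread = process.GetSelectedThread()
    frame = thread.GetSelectedFrame()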
*DWARFASTParserClang::ParseFunctionFromDWARF(CompileUnit &comp_unit, const DWARFDIE &die) { DWARFRangeList func_ranges; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h index 75647dbb082f2..454637ef981c7 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h @@ -24,8 +24,8 @@ class DWARFCompileUnit : public DWARFUnit { DWARFCompileUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, const DWARFUnitHeader &header, const DWARFAbbreviationDeclarationSet &abbrevs, - DIERef::Section section) - : DWARFUnit(dwarf, uid, header, abbrevs, section) {} + DIERef::Section section, bool is_dwo) + : DWARFUnit(dwarf, uid, header, abbrevs, section, is_dwo) {} DISALLOW_COPY_AND_ASSIGN(DWARFCompileUnit); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.h index add0423840397..24baac90aa445 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.h @@ -41,8 +41,6 @@ class DWARFContext { SectionData m_data_debug_str_offsets; SectionData m_data_debug_types; - bool isDwo() { return m_dwo_section_list != nullptr; } - const DWARFDataExtractor & LoadOrGetSection(lldb::SectionType main_section_type, llvm::Optional dwo_section_type, @@ -67,6 +65,8 @@ class DWARFContext { const DWARFDataExtractor &getOrLoadStrOffsetsData(); const DWARFDataExtractor &getOrLoadDebugTypesData(); + bool isDwo() { return m_dwo_section_list != nullptr; } + llvm::DWARFContext &GetAsLLVM(); }; } // namespace lldb_private diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp index 5612c59059bed..5b95912909ee9 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp @@ -344,7 +344,7 @@ bool DWARFDebugInfoEntry::GetDIENamesAndRanges( *frame_base = DWARFExpression( module, DataExtractor(data, block_offset, block_length), cu); } else { - DataExtractor data = dwarf.DebugLocData(); + DataExtractor data = cu->GetLocationData(); const dw_offset_t offset = form_value.Unsigned(); if (data.ValidOffset(offset)) { data = DataExtractor(data, offset, data.GetByteSize() - offset); @@ -478,8 +478,6 @@ void DWARFDebugInfoEntry::DumpAttribute( s.PutCString("( "); - SymbolFileDWARF &dwarf = cu->GetSymbolFileDWARF(); - // Check to see if we have any special attribute formatters switch (attr) { case DW_AT_stmt_list: @@ -509,7 +507,7 @@ void DWARFDebugInfoEntry::DumpAttribute( // We have a location list offset as the value that is the offset into // the .debug_loc section that describes the value over it's lifetime uint64_t debug_loc_offset = form_value.Unsigned(); - DWARFExpression::PrintDWARFLocationList(s, cu, dwarf.DebugLocData(), + DWARFExpression::PrintDWARFLocationList(s, cu, cu->GetLocationData(), debug_loc_offset); } } break; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp index 5cab4cef143e0..f660cc32b3f8d 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp @@ -602,101 +602,6 @@ bool DWARFFormValue::IsDataForm(const dw_form_t form) { return false; } -int DWARFFormValue::Compare(const DWARFFormValue &a_value, - const DWARFFormValue &b_value) { - dw_form_t a_form 
= a_value.Form(); - dw_form_t b_form = b_value.Form(); - if (a_form < b_form) - return -1; - if (a_form > b_form) - return 1; - switch (a_form) { - case DW_FORM_addr: - case DW_FORM_addrx: - case DW_FORM_flag: - case DW_FORM_data1: - case DW_FORM_data2: - case DW_FORM_data4: - case DW_FORM_data8: - case DW_FORM_udata: - case DW_FORM_ref_addr: - case DW_FORM_sec_offset: - case DW_FORM_flag_present: - case DW_FORM_ref_sig8: - case DW_FORM_GNU_addr_index: { - uint64_t a = a_value.Unsigned(); - uint64_t b = b_value.Unsigned(); - if (a < b) - return -1; - if (a > b) - return 1; - return 0; - } - - case DW_FORM_sdata: { - int64_t a = a_value.Signed(); - int64_t b = b_value.Signed(); - if (a < b) - return -1; - if (a > b) - return 1; - return 0; - } - - case DW_FORM_string: - case DW_FORM_strp: - case DW_FORM_GNU_str_index: { - const char *a_string = a_value.AsCString(); - const char *b_string = b_value.AsCString(); - if (a_string == b_string) - return 0; - else if (a_string && b_string) - return strcmp(a_string, b_string); - else if (a_string == nullptr) - return -1; // A string is NULL, and B is valid - else - return 1; // A string valid, and B is NULL - } - - case DW_FORM_block: - case DW_FORM_block1: - case DW_FORM_block2: - case DW_FORM_block4: - case DW_FORM_exprloc: { - uint64_t a_len = a_value.Unsigned(); - uint64_t b_len = b_value.Unsigned(); - if (a_len < b_len) - return -1; - if (a_len > b_len) - return 1; - // The block lengths are the same - return memcmp(a_value.BlockData(), b_value.BlockData(), a_value.Unsigned()); - } break; - - case DW_FORM_ref1: - case DW_FORM_ref2: - case DW_FORM_ref4: - case DW_FORM_ref8: - case DW_FORM_ref_udata: { - uint64_t a = a_value.m_value.value.uval; - uint64_t b = b_value.m_value.value.uval; - if (a < b) - return -1; - if (a > b) - return 1; - return 0; - } - - case DW_FORM_indirect: - llvm_unreachable( - "This shouldn't happen after the form has been extracted..."); - - default: - llvm_unreachable("Unhandled DW_FORM"); - } - return -1; -} - bool DWARFFormValue::FormIsSupported(dw_form_t form) { switch (form) { case DW_FORM_addr: diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h index 6ff73ecd8efa3..8967509c081aa 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h @@ -28,8 +28,8 @@ class DWARFTypeUnit : public DWARFUnit { DWARFTypeUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, const DWARFUnitHeader &header, const DWARFAbbreviationDeclarationSet &abbrevs, - DIERef::Section section) - : DWARFUnit(dwarf, uid, header, abbrevs, section) {} + DIERef::Section section, bool is_dwo) + : DWARFUnit(dwarf, uid, header, abbrevs, section, is_dwo) {} friend class DWARFUnit; }; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp index d8d70bae0232d..dcb38da3c43ee 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp @@ -32,9 +32,9 @@ extern int g_verbose; DWARFUnit::DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, const DWARFUnitHeader &header, const DWARFAbbreviationDeclarationSet &abbrevs, - DIERef::Section section) + DIERef::Section section, bool is_dwo) : UserID(uid), m_dwarf(dwarf), m_header(header), m_abbrevs(&abbrevs), - m_cancel_scopes(false), m_section(section) {} + m_cancel_scopes(false), m_section(section), m_is_dwo(is_dwo) {} DWARFUnit::~DWARFUnit() = default; @@ -336,6 
+336,9 @@ void DWARFUnit::AddUnitDIE(const DWARFDebugInfoEntry &cu_die) { } } + if (m_is_dwo) + return; + std::unique_ptr<SymbolFileDWARFDwo> dwo_symbol_file = m_dwarf.GetDwoSymbolFileForCompileUnit(*this, cu_die); if (!dwo_symbol_file) @@ -459,6 +462,22 @@ void DWARFUnit::SetLoclistsBase(dw_addr_t loclists_base) { } } +std::unique_ptr<llvm::DWARFLocationTable> +DWARFUnit::GetLocationTable(const DataExtractor &data) const { + llvm::DWARFDataExtractor llvm_data( + toStringRef(data.GetData()), + data.GetByteOrder() == lldb::eByteOrderLittle, data.GetAddressByteSize()); + + if (m_is_dwo || GetVersion() >= 5) + return std::make_unique<llvm::DWARFDebugLoclists>(llvm_data, GetVersion()); + return std::make_unique<llvm::DWARFDebugLoc>(llvm_data); +} + +const DWARFDataExtractor &DWARFUnit::GetLocationData() const { + return GetVersion() >= 5 ? GetSymbolFileDWARF().get_debug_loclists_data() : GetSymbolFileDWARF().get_debug_loc_data(); +} + void DWARFUnit::SetRangesBase(dw_addr_t ranges_base) { m_ranges_base = ranges_base; @@ -872,11 +891,12 @@ DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid, return llvm::make_error<llvm::object::GenericBinaryError>( "No abbrev exists at the specified offset."); + bool is_dwo = dwarf.GetDWARFContext().isDwo(); if (expected_header->IsTypeUnit()) - return DWARFUnitSP( - new DWARFTypeUnit(dwarf, uid, *expected_header, *abbrevs, section)); - return DWARFUnitSP( - new DWARFCompileUnit(dwarf, uid, *expected_header, *abbrevs, section)); + return DWARFUnitSP(new DWARFTypeUnit(dwarf, uid, *expected_header, *abbrevs, + section, is_dwo)); + return DWARFUnitSP(new DWARFCompileUnit(dwarf, uid, *expected_header, + *abbrevs, section, is_dwo)); } const lldb_private::DWARFDataExtractor &DWARFUnit::GetData() const { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h index d53ed756fe05d..6bee4ab8be8e7 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h @@ -243,11 +243,18 @@ class DWARFUnit : public lldb_private::UserID { return *Offset + m_loclists_base; } + /// Return the location table for parsing the given location list data. The + /// format is chosen according to the unit type. Never returns null.
+ std::unique_ptr + GetLocationTable(const lldb_private::DataExtractor &data) const; + + const lldb_private::DWARFDataExtractor &GetLocationData() const; + protected: DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, const DWARFUnitHeader &header, const DWARFAbbreviationDeclarationSet &abbrevs, - DIERef::Section section); + DIERef::Section section, bool is_dwo); llvm::Error ExtractHeader(SymbolFileDWARF &dwarf, const lldb_private::DWARFDataExtractor &data, @@ -314,6 +321,7 @@ class DWARFUnit : public lldb_private::UserID { llvm::Optional m_loclist_table_header; const DIERef::Section m_section; + bool m_is_dwo; private: void ParseProducerInfo(); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 53339ea31e71a..0792260e36fe5 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -597,13 +597,6 @@ void SymbolFileDWARF::LoadSectionData(lldb::SectionType sect_type, m_objfile_sp->ReadSectionData(section_sp.get(), data); } -const DWARFDataExtractor &SymbolFileDWARF::DebugLocData() { - const DWARFDataExtractor &debugLocData = get_debug_loc_data(); - if (debugLocData.GetByteSize() > 0) - return debugLocData; - return get_debug_loclists_data(); -} - const DWARFDataExtractor &SymbolFileDWARF::get_debug_loc_data() { return GetCachedSectionData(eSectionTypeDWARFDebugLoc, m_data_debug_loc); } @@ -3361,7 +3354,7 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, module, DataExtractor(data, block_offset, block_length), die.GetCU()); } else { - DataExtractor data = DebugLocData(); + DataExtractor data = die.GetCU()->GetLocationData(); dw_offset_t offset = form_value.Unsigned(); if (form_value.Form() == DW_FORM_loclistx) offset = die.GetCU()->GetLoclistOffset(offset).getValueOr(-1); @@ -3978,13 +3971,6 @@ SymbolFileDWARFDebugMap *SymbolFileDWARF::GetDebugMapSymfile() { return m_debug_map_symfile; } -DWARFExpression::LocationListFormat -SymbolFileDWARF::GetLocationListFormat() const { - if (m_data_debug_loclists.m_data.GetByteSize() > 0) - return DWARFExpression::LocLists; - return DWARFExpression::RegularLocationList; -} - SymbolFileDWARFDwp *SymbolFileDWARF::GetDwpSymbolFile() { llvm::call_once(m_dwp_symfile_once_flag, [this]() { ModuleSpec module_spec; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index 35b18f4b02b35..f816dd77800e4 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -21,7 +21,6 @@ #include "lldb/Core/UniqueCStringMap.h" #include "lldb/Core/dwarf.h" -#include "lldb/Expression/DWARFExpression.h" #include "lldb/Symbol/DebugMacros.h" #include "lldb/Symbol/SymbolContext.h" #include "lldb/Symbol/SymbolFile.h" @@ -236,8 +235,6 @@ class SymbolFileDWARF : public lldb_private::SymbolFile, DWARFDebugRanges *GetDebugRanges(); - const lldb_private::DWARFDataExtractor &DebugLocData(); - static bool SupportedVersion(uint16_t version); DWARFDIE @@ -260,9 +257,6 @@ class SymbolFileDWARF : public lldb_private::SymbolFile, virtual lldb::CompUnitSP ParseCompileUnit(DWARFCompileUnit &dwarf_cu); - virtual lldb_private::DWARFExpression::LocationListFormat - GetLocationListFormat() const; - lldb::ModuleSP GetExternalModule(lldb_private::ConstString name); typedef std::map @@ -287,7 +281,7 @@ class SymbolFileDWARF : public lldb_private::SymbolFile, lldb::user_id_t 
GetUID(DIERef ref); - virtual std::unique_ptr + std::unique_ptr GetDwoSymbolFileForCompileUnit(DWARFUnit &dwarf_cu, const DWARFDebugInfoEntry &cu_die); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp index 331417fe5cd12..f75f06f31e2da 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp @@ -137,13 +137,6 @@ SymbolFileDWARF &SymbolFileDWARFDwo::GetBaseSymbolFile() { return m_base_dwarf_cu.GetSymbolFileDWARF(); } -DWARFExpression::LocationListFormat -SymbolFileDWARFDwo::GetLocationListFormat() const { - return m_base_dwarf_cu.GetVersion() >= 5 - ? DWARFExpression::LocLists - : DWARFExpression::SplitDwarfLocationList; -} - llvm::Expected SymbolFileDWARFDwo::GetTypeSystemForLanguage(LanguageType language) { return GetBaseSymbolFile().GetTypeSystemForLanguage(language); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h index d07209784dd7e..0855dba044e4f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h @@ -35,9 +35,6 @@ class SymbolFileDWARFDwo : public SymbolFileDWARF { DWARFUnit * GetDWARFCompileUnit(lldb_private::CompileUnit *comp_unit) override; - lldb_private::DWARFExpression::LocationListFormat - GetLocationListFormat() const override; - size_t GetObjCMethodDIEOffsets(lldb_private::ConstString class_name, DIEArray &method_die_offsets) override; @@ -47,12 +44,6 @@ class SymbolFileDWARFDwo : public SymbolFileDWARF { DWARFDIE GetDIE(const DIERef &die_ref) override; - std::unique_ptr - GetDwoSymbolFileForCompileUnit(DWARFUnit &dwarf_cu, - const DWARFDebugInfoEntry &cu_die) override { - return nullptr; - } - DWARFCompileUnit *GetBaseCompileUnit() override { return &m_base_dwarf_cu; } llvm::Optional GetDwoNum() override { return GetID() >> 32; } diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp index c392317df0066..e92585ccfed70 100644 --- a/lldb/source/Symbol/Function.cpp +++ b/lldb/source/Symbol/Function.cpp @@ -76,16 +76,6 @@ InlineFunctionInfo::InlineFunctionInfo(ConstString name, InlineFunctionInfo::~InlineFunctionInfo() {} -int InlineFunctionInfo::Compare(const InlineFunctionInfo &a, - const InlineFunctionInfo &b) { - - int result = FunctionInfo::Compare(a, b); - if (result) - return result; - // only compare the mangled names if both have them - return Mangled::Compare(a.m_mangled, a.m_mangled); -} - void InlineFunctionInfo::Dump(Stream *s, bool show_fullpaths) const { FunctionInfo::Dump(s, show_fullpaths); if (m_mangled) diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp index aeca76f7d05f8..f194356a0a079 100644 --- a/lldb/source/Symbol/Type.cpp +++ b/lldb/source/Symbol/Type.cpp @@ -656,17 +656,6 @@ CompilerType Type::GetForwardCompilerType() { return m_compiler_type; } -int Type::Compare(const Type &a, const Type &b) { - // Just compare the UID values for now... 
- lldb::user_id_t a_uid = a.GetID(); - lldb::user_id_t b_uid = b.GetID(); - if (a_uid < b_uid) - return -1; - if (a_uid > b_uid) - return 1; - return 0; -} - ConstString Type::GetQualifiedName() { return GetForwardCompilerType().GetConstTypeName(); } diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index a8fb32dafa898..6711dc37eca63 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -137,19 +137,12 @@ ProcessProperties::ProcessProperties(lldb_private::Process *process) Process::GetGlobalProperties().get()); m_collection_sp->SetValueChangedCallback( ePropertyPythonOSPluginPath, - ProcessProperties::OptionValueChangedCallback, this); + [this] { m_process->LoadOperatingSystemPlugin(true); }); } } ProcessProperties::~ProcessProperties() = default; -void ProcessProperties::OptionValueChangedCallback(void *baton, - OptionValue *option_value) { - ProcessProperties *properties = (ProcessProperties *)baton; - if (properties->m_process) - properties->m_process->LoadOperatingSystemPlugin(true); -} - bool ProcessProperties::GetDisableMemoryCache() const { const uint32_t idx = ePropertyDisableMemCache; return m_collection_sp->GetPropertyAtIndexAsBoolean( diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index e35a10a3f6bf8..83e6f3062666a 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -3461,29 +3461,24 @@ TargetProperties::TargetProperties(Target *target) // Set callbacks to update launch_info whenever "settins set" updated any // of these properties m_collection_sp->SetValueChangedCallback( - ePropertyArg0, TargetProperties::Arg0ValueChangedCallback, this); + ePropertyArg0, [this] { Arg0ValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyRunArgs, TargetProperties::RunArgsValueChangedCallback, this); + ePropertyRunArgs, [this] { RunArgsValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyEnvVars, TargetProperties::EnvVarsValueChangedCallback, this); + ePropertyEnvVars, [this] { EnvVarsValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyInputPath, TargetProperties::InputPathValueChangedCallback, - this); + ePropertyInputPath, [this] { InputPathValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyOutputPath, TargetProperties::OutputPathValueChangedCallback, - this); + ePropertyOutputPath, [this] { OutputPathValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyErrorPath, TargetProperties::ErrorPathValueChangedCallback, - this); + ePropertyErrorPath, [this] { ErrorPathValueChangedCallback(); }); + m_collection_sp->SetValueChangedCallback(ePropertyDetachOnError, [this] { + DetachOnErrorValueChangedCallback(); + }); m_collection_sp->SetValueChangedCallback( - ePropertyDetachOnError, - TargetProperties::DetachOnErrorValueChangedCallback, this); + ePropertyDisableASLR, [this] { DisableASLRValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyDisableASLR, TargetProperties::DisableASLRValueChangedCallback, - this); - m_collection_sp->SetValueChangedCallback( - ePropertyDisableSTDIO, - TargetProperties::DisableSTDIOValueChangedCallback, this); + ePropertyDisableSTDIO, [this] { DisableSTDIOValueChangedCallback(); }); m_experimental_properties_up.reset(new TargetExperimentalProperties()); m_collection_sp->AppendProperty( @@ -3493,16 +3488,16 @@ TargetProperties::TargetProperties(Target *target) true, 
m_experimental_properties_up->GetValueProperties()); // Update m_launch_info once it was created - Arg0ValueChangedCallback(this, nullptr); - RunArgsValueChangedCallback(this, nullptr); - // EnvVarsValueChangedCallback(this, nullptr); // FIXME: cause segfault in + Arg0ValueChangedCallback(); + RunArgsValueChangedCallback(); + // EnvVarsValueChangedCallback(); // FIXME: cause segfault in // Target::GetPlatform() - InputPathValueChangedCallback(this, nullptr); - OutputPathValueChangedCallback(this, nullptr); - ErrorPathValueChangedCallback(this, nullptr); - DetachOnErrorValueChangedCallback(this, nullptr); - DisableASLRValueChangedCallback(this, nullptr); - DisableSTDIOValueChangedCallback(this, nullptr); + InputPathValueChangedCallback(); + OutputPathValueChangedCallback(); + ErrorPathValueChangedCallback(); + DetachOnErrorValueChangedCallback(); + DisableASLRValueChangedCallback(); + DisableSTDIOValueChangedCallback(); } else { m_collection_sp = std::make_shared(ConstString("target")); @@ -3975,81 +3970,54 @@ void TargetProperties::SetRequireHardwareBreakpoints(bool b) { m_collection_sp->SetPropertyAtIndexAsBoolean(nullptr, idx, b); } -void TargetProperties::Arg0ValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - this_->m_launch_info.SetArg0(this_->GetArg0()); +void TargetProperties::Arg0ValueChangedCallback() { + m_launch_info.SetArg0(GetArg0()); } -void TargetProperties::RunArgsValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); +void TargetProperties::RunArgsValueChangedCallback() { Args args; - if (this_->GetRunArguments(args)) - this_->m_launch_info.GetArguments() = args; + if (GetRunArguments(args)) + m_launch_info.GetArguments() = args; } -void TargetProperties::EnvVarsValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - this_->m_launch_info.GetEnvironment() = this_->GetEnvironment(); +void TargetProperties::EnvVarsValueChangedCallback() { + m_launch_info.GetEnvironment() = GetEnvironment(); } -void TargetProperties::InputPathValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - this_->m_launch_info.AppendOpenFileAction( - STDIN_FILENO, this_->GetStandardInputPath(), true, false); +void TargetProperties::InputPathValueChangedCallback() { + m_launch_info.AppendOpenFileAction(STDIN_FILENO, GetStandardInputPath(), true, + false); } -void TargetProperties::OutputPathValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - this_->m_launch_info.AppendOpenFileAction( - STDOUT_FILENO, this_->GetStandardOutputPath(), false, true); +void TargetProperties::OutputPathValueChangedCallback() { + m_launch_info.AppendOpenFileAction(STDOUT_FILENO, GetStandardOutputPath(), + false, true); } -void TargetProperties::ErrorPathValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - this_->m_launch_info.AppendOpenFileAction( - STDERR_FILENO, this_->GetStandardErrorPath(), false, true); +void TargetProperties::ErrorPathValueChangedCallback() { + m_launch_info.AppendOpenFileAction(STDERR_FILENO, GetStandardErrorPath(), + false, true); } -void TargetProperties::DetachOnErrorValueChangedCallback( - void *target_property_ptr, OptionValue *) { - 
TargetProperties *this_ = - static_cast(target_property_ptr); - if (this_->GetDetachOnError()) - this_->m_launch_info.GetFlags().Set(lldb::eLaunchFlagDetachOnError); +void TargetProperties::DetachOnErrorValueChangedCallback() { + if (GetDetachOnError()) + m_launch_info.GetFlags().Set(lldb::eLaunchFlagDetachOnError); else - this_->m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDetachOnError); + m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDetachOnError); } -void TargetProperties::DisableASLRValueChangedCallback( - void *target_property_ptr, OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - if (this_->GetDisableASLR()) - this_->m_launch_info.GetFlags().Set(lldb::eLaunchFlagDisableASLR); +void TargetProperties::DisableASLRValueChangedCallback() { + if (GetDisableASLR()) + m_launch_info.GetFlags().Set(lldb::eLaunchFlagDisableASLR); else - this_->m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDisableASLR); + m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDisableASLR); } -void TargetProperties::DisableSTDIOValueChangedCallback( - void *target_property_ptr, OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - if (this_->GetDisableSTDIO()) - this_->m_launch_info.GetFlags().Set(lldb::eLaunchFlagDisableSTDIO); +void TargetProperties::DisableSTDIOValueChangedCallback() { + if (GetDisableSTDIO()) + m_launch_info.GetFlags().Set(lldb::eLaunchFlagDisableSTDIO); else - this_->m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDisableSTDIO); + m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDisableSTDIO); } // Target::TargetEventData diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/independent_state.in b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/independent_state.in new file mode 100644 index 0000000000000..6e15a8ff663ec --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/independent_state.in @@ -0,0 +1,6 @@ +script foobar = 40 + 7 +script print(foobar) +script d = lldb.SBDebugger.Create() +script d:HandleCommand("script foobar = 40 + 2") +script print(foobar) +script d:HandleCommand("script print(foobar)") diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions.in b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions.in new file mode 100644 index 0000000000000..75c57e364cac7 --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions.in @@ -0,0 +1,6 @@ +script +print(lldb.target, lldb.debugger:GetSelectedTarget()) +lldb.debugger:SetSelectedTarget(lldb.debugger:GetTargetAtIndex(0)) +print(lldb.target, lldb.debugger:GetSelectedTarget()) +lldb.debugger:HandleCommand("script print(lldb.target, lldb.debugger:GetSelectedTarget())") +print(lldb.target, lldb.debugger:GetSelectedTarget()) diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions_2.in b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions_2.in new file mode 100644 index 0000000000000..a8cc2a57a55b3 --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions_2.in @@ -0,0 +1,2 @@ +script +print(lldb.target, lldb.debugger:GetSelectedTarget()) diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test b/lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test new file mode 100644 index 0000000000000..022f2e38db49a --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test @@ -0,0 +1,17 @@ +# REQUIRES: lua +# +# This tests that the convenience variables are not nil. 
Given that there is no +# target we only expect the debugger to be valid. +# +# RUN: cat %s | %lldb --script-language lua 2>&1 | FileCheck %s +script +print(string.format("lldb.debugger is valid: %s", lldb.debugger:IsValid())) +print(string.format("lldb.target is valid: %s", lldb.target:IsValid())) +print(string.format("lldb.process is valid: %s", lldb.process:IsValid())) +print(string.format("lldb.thread is valid: %s", lldb.thread:IsValid())) +print(string.format("lldb.frame is valid: %s", lldb.frame:IsValid())) +# CHECK: debugger is valid: true +# CHECK: target is valid: false +# CHECK: process is valid: false +# CHECK: thread is valid: false +# CHECK: frame is valid: false diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/independent_state.test b/lldb/test/Shell/ScriptInterpreter/Lua/independent_state.test new file mode 100644 index 0000000000000..2ade1b91c1cb6 --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/independent_state.test @@ -0,0 +1,6 @@ +# REQUIRES: lua +# +# RUN: %lldb --script-language lua -s %S/Inputs/independent_state.in 2>&1 | FileCheck %s +# CHECK: 47 +# CHECK: 47 +# CHECK: 42 diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/nested_sessions.test b/lldb/test/Shell/ScriptInterpreter/Lua/nested_sessions.test new file mode 100644 index 0000000000000..a81418b6af61d --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/nested_sessions.test @@ -0,0 +1,12 @@ +# REQUIRES: lua +# RUN: mkdir -p %t +# RUN: echo "int main() { return 0; }" | %clang_host -x c - -o %t/foo +# RUN: echo "int main() { return 0; }" | %clang_host -x c - -o %t/bar +# RUN: %lldb --script-language lua -o "file %t/bar" -o "file %t/foo" -s %S/Inputs/nested_sessions.in -s %S/Inputs/nested_sessions_2.in 2>&1 | FileCheck %s +# CHECK: script +# CHECK-NEXT: foo foo +# CHECK-NEXT: foo bar +# CHECK-NEXT: foo bar +# CHECK-NEXT: foo bar +# CHECK: script +# CHECK-NEXT: bar bar diff --git a/lldb/test/Shell/SymbolFile/DWARF/debug_loc_and_loclists.s b/lldb/test/Shell/SymbolFile/DWARF/debug_loc_and_loclists.s new file mode 100644 index 0000000000000..05bccbe78aabf --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/debug_loc_and_loclists.s @@ -0,0 +1,154 @@ +# Test that we can handle DWARF 4 and 5 location lists in the same object file +# (but different compile units). 
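For orientation before the assembly: the consumer side this test drives is the per-unit table API introduced above, where DWARFExpression::GetLocationExpression obtains a table from its unit and walks the entries. A minimal sketch of that walk, assuming LLVM's DWARFLocationTable interface of the same vintage (the error handling here is illustrative only, not the patch's exact code):

  // The DWARF v4 unit hands back an llvm::DWARFDebugLoc, the v5 unit an
  // llvm::DWARFDebugLoclists; both are consumed through the same base class.
  std::unique_ptr<llvm::DWARFLocationTable> loctable_up =
      m_dwarf_cu->GetLocationTable(m_data);
  uint64_t offset = 0;
  llvm::Error error = loctable_up->visitLocationList(
      &offset, [&](const llvm::DWARFLocationEntry &entry) {
        // entry.Kind is a DW_LLE_* constant; entry.Loc holds the raw DWARF
        // expression bytes (DW_OP_reg5 and DW_OP_reg0 in this test).
        return true; // returning false stops the walk early
      });
  if (error)
    llvm::consumeError(std::move(error)); // a real caller should report this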
+ +# REQUIRES: x86 + +# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t +# RUN: %lldb %t -o "image lookup -v -s loc" -o "image lookup -v -s loclists" \ +# RUN: -o exit | FileCheck %s + + +# CHECK-LABEL: image lookup -v -s loc +# CHECK: Variable: {{.*}}, name = "x0", type = "int", location = DW_OP_reg5 RDI, + +# CHECK-LABEL: image lookup -v -s loclists +# CHECK: Variable: {{.*}}, name = "x1", type = "int", location = DW_OP_reg0 RAX, + + +loc: + nop +.Lloc_end: + +loclists: + nop +.Lloclists_end: + + .section .debug_loc,"",@progbits +.Lloc_list: + .quad loc-loc + .quad .Lloc_end-loc + .short 1 + .byte 85 # super-register DW_OP_reg5 + .quad 0 + .quad 0 + + .section .debug_loclists,"",@progbits + .long .Ldebug_loclist_table_end0-.Ldebug_loclist_table_start0 # Length +.Ldebug_loclist_table_start0: + .short 5 # Version + .byte 8 # Address size + .byte 0 # Segment selector size + .long 0 # Offset entry count + +.Lloclists_list: + .byte 4 # DW_LLE_offset_pair + .uleb128 loclists-loclists + .uleb128 .Lloclists_end-loclists + .uleb128 1 + .byte 80 # super-register DW_OP_reg0 + .byte 0 # DW_LLE_end_of_list +.Ldebug_loclist_table_end0: + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 73 # DW_AT_type + .byte 16 # DW_FORM_ref_addr + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 23 # DW_FORM_sec_offset + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 73 # DW_AT_type + .byte 16 # DW_FORM_ref_addr + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section .debug_info,"",@progbits + .long .Lloc_cu_end-.Lloc_cu_start # Length of Unit +.Lloc_cu_start: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x50 DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .short 12 # DW_AT_language + .quad loc # DW_AT_low_pc + .long .Lloc_end-loc # DW_AT_high_pc + .byte 2 # Abbrev [2] 0x2a:0x29 DW_TAG_subprogram + .quad loc # DW_AT_low_pc + .long .Lloc_end-loc # DW_AT_high_pc + .asciz "loc" # DW_AT_name + .long .Lint # DW_AT_type + .byte 3 # Abbrev [3] DW_TAG_formal_parameter + .long .Lloc_list # DW_AT_location + .asciz "x0" # DW_AT_name + .long .Lint # DW_AT_type + .byte 0 # End Of Children Mark +.Lint: + .byte 4 # Abbrev [4] 0x53:0x7 DW_TAG_base_type + .asciz "int" # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Lloc_cu_end: + + .long .Lloclists_cu_end-.Lloclists_cu_start # Length of Unit +.Lloclists_cu_start: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 1 # Abbrev [1] 0xb:0x50 DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .short 12 # DW_AT_language + .quad loclists # DW_AT_low_pc + .long .Lloclists_end-loclists # DW_AT_high_pc + .byte 2 # Abbrev [2] 0x2a:0x29 DW_TAG_subprogram + .quad loclists # DW_AT_low_pc + .long .Lloclists_end-loclists # DW_AT_high_pc + .asciz "loclists" # DW_AT_name + .long .Lint # DW_AT_type + .byte 3 # Abbrev [3] DW_TAG_formal_parameter + .long .Lloclists_list # DW_AT_location + .asciz "x1" # DW_AT_name + .long .Lint # DW_AT_type + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.Lloclists_cu_end: diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm index 2e952d6ad0bc7..40facdfb5cf9b 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm @@ -86,7 +86,7 @@ static CFStringRef CopyBundleIDForPath(const char *app_bundle_path, #if defined(WITH_BKS) || defined(WITH_FBS) #import static const int OPEN_APPLICATION_TIMEOUT_ERROR = 111; -typedef void (*SetErrorFunction)(NSInteger, DNBError &); +typedef void (*SetErrorFunction)(NSInteger, std::string, DNBError &); typedef bool (*CallOpenApplicationFunction)(NSString *bundleIDNSStr, NSDictionary *options, DNBError &error, pid_t *return_pid); @@ -122,6 +122,7 @@ static bool CallBoardSystemServiceOpenApplication(NSString *bundleIDNSStr, mach_port_t client_port = [system_service createClientPort]; __block dispatch_semaphore_t semaphore = dispatch_semaphore_create(0); __block ErrorFlavor open_app_error = no_error_enum_value; + __block std::string open_app_error_string; bool wants_pid = (return_pid != NULL); __block pid_t pid_in_block; @@ -159,6 +160,9 @@ static bool CallBoardSystemServiceOpenApplication(NSString *bundleIDNSStr, } else { const char *error_str = [(NSString *)[bks_error localizedDescription] UTF8String]; + if (error_str) { + open_app_error_string = error_str; + } DNBLogThreadedIf(LOG_PROCESS, "In completion handler for send " "event, got error \"%s\"(%ld).", error_str ? 
error_str : "", @@ -190,7 +194,7 @@ static bool CallBoardSystemServiceOpenApplication(NSString *bundleIDNSStr, error.SetError(OPEN_APPLICATION_TIMEOUT_ERROR, DNBError::Generic); error.SetErrorString("timed out trying to launch app"); } else if (open_app_error != no_error_enum_value) { - error_function(open_app_error, error); + error_function(open_app_error, open_app_error_string, error); DNBLogError("unable to launch the application with CFBundleIdentifier '%s' " "bks_error = %u", cstr, open_app_error); @@ -245,19 +249,19 @@ static bool IsBKSProcess(nub_process_t pid) { return app_state != BKSApplicationStateUnknown; } -static void SetBKSError(NSInteger error_code, DNBError &error) { +static void SetBKSError(NSInteger error_code, + std::string error_description, + DNBError &error) { error.SetError(error_code, DNBError::BackBoard); NSString *err_nsstr = ::BKSOpenApplicationErrorCodeToString( (BKSOpenApplicationErrorCode)error_code); - const char *err_str = NULL; - if (err_nsstr == NULL) - err_str = "unknown BKS error"; - else { + std::string err_str = "unknown BKS error"; + if (error_description.empty() == false) { + err_str = error_description; + } else if (err_nsstr != nullptr) { err_str = [err_nsstr UTF8String]; - if (err_str == NULL) - err_str = "unknown BKS error"; } - error.SetErrorString(err_str); + error.SetErrorString(err_str.c_str()); } static bool BKSAddEventDataToOptions(NSMutableDictionary *options, @@ -355,19 +359,19 @@ static bool IsFBSProcess(nub_process_t pid) { } #endif -static void SetFBSError(NSInteger error_code, DNBError &error) { +static void SetFBSError(NSInteger error_code, + std::string error_description, + DNBError &error) { error.SetError((DNBError::ValueType)error_code, DNBError::FrontBoard); NSString *err_nsstr = ::FBSOpenApplicationErrorCodeToString( (FBSOpenApplicationErrorCode)error_code); - const char *err_str = NULL; - if (err_nsstr == NULL) - err_str = "unknown FBS error"; - else { + std::string err_str = "unknown FBS error"; + if (error_description.empty() == false) { + err_str = error_description; + } else if (err_nsstr != nullptr) { err_str = [err_nsstr UTF8String]; - if (err_str == NULL) - err_str = "unknown FBS error"; } - error.SetErrorString(err_str); + error.SetErrorString(err_str.c_str()); } static bool FBSAddEventDataToOptions(NSMutableDictionary *options, @@ -2754,7 +2758,8 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, "debugserver timed out waiting for openApplication to complete."); attach_err.SetError(OPEN_APPLICATION_TIMEOUT_ERROR, DNBError::Generic); } else if (attach_error_code != FBSOpenApplicationErrorCodeNone) { - SetFBSError(attach_error_code, attach_err); + std::string empty_str; + SetFBSError(attach_error_code, empty_str, attach_err); DNBLogError("unable to launch the application with CFBundleIdentifier " "'%s' bks_error = %ld", bundleIDStr.c_str(), (NSInteger)attach_error_code); @@ -2831,7 +2836,8 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, "debugserver timed out waiting for openApplication to complete."); attach_err.SetError(OPEN_APPLICATION_TIMEOUT_ERROR, DNBError::Generic); } else if (attach_error_code != BKSOpenApplicationErrorCodeNone) { - SetBKSError(attach_error_code, attach_err); + std::string empty_str; + SetBKSError(attach_error_code, empty_str, attach_err); DNBLogError("unable to launch the application with CFBundleIdentifier " "'%s' bks_error = %ld", bundleIDStr.c_str(), attach_error_code); diff --git a/llvm/CODE_OWNERS.TXT b/llvm/CODE_OWNERS.TXT index 
df8aa0b4ef9d2..457dabe39f90c 100644 --- a/llvm/CODE_OWNERS.TXT +++ b/llvm/CODE_OWNERS.TXT @@ -150,6 +150,10 @@ N: Dylan McKay E: me@dylanmckay.io D: AVR Backend +N: Simon Moll +E: simon.moll@emea.nec.com +D: VE Backend + N: Tim Northover E: t.p.northover@gmail.com D: AArch64 backend, misc ARM backend diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h index e87a08f7effff..9bfaaccd953e2 100644 --- a/llvm/include/llvm/ADT/StringRef.h +++ b/llvm/include/llvm/ADT/StringRef.h @@ -566,7 +566,8 @@ namespace llvm { /// /// If \p AllowInexact is false, the function will fail if the string /// cannot be represented exactly. Otherwise, the function only fails - /// in case of an overflow or underflow. + /// in case of an overflow or underflow, or an invalid floating point + /// representation. bool getAsDouble(double &Result, bool AllowInexact = true) const; /// @} diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h index e9e57b6e92208..62b905f65bd74 100644 --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -98,7 +98,8 @@ class Triple { fpga_aoco, // Intel FPGA: unlinked object file fpga_aocr, // Intel FPGA: linked early image fpga_aocx, // Intel FPGA: linked image - LastArchType = renderscript64 + ve, // NEC SX-Aurora Vector Engine + LastArchType = ve }; enum SubArchType { NoSubArch, @@ -135,7 +136,9 @@ class Triple { SPIRSubArch_fpga, SPIRSubArch_gen, - SPIRSubArch_x86_64 + SPIRSubArch_x86_64, + + PPCSubArch_spe }; enum VendorType { UnknownVendor, @@ -746,6 +749,11 @@ class Triple { return getArch() == Triple::x86 || getArch() == Triple::x86_64; } + /// Tests whether the target is VE + bool isVE() const { + return getArch() == Triple::ve; + } + /// Tests whether the target supports comdat bool supportsCOMDAT() const { return !isOSBinFormatMachO(); diff --git a/llvm/include/llvm/CodeGen/LivePhysRegs.h b/llvm/include/llvm/CodeGen/LivePhysRegs.h index 50da0b3d5c483..085893462a083 100644 --- a/llvm/include/llvm/CodeGen/LivePhysRegs.h +++ b/llvm/include/llvm/CodeGen/LivePhysRegs.h @@ -137,6 +137,9 @@ class LivePhysRegs { /// Live out registers are the union of the live-in registers of the successor /// blocks and pristine registers. Live out registers of the end block are the /// callee saved registers. + /// If a register is not added by this method, it is guaranteed to not be + /// live out from MBB, although a sub-register may be. This is true + /// both before and after regalloc. void addLiveOuts(const MachineBasicBlock &MBB); /// Adds all live-out registers of basic block \p MBB but skips pristine diff --git a/llvm/include/llvm/CodeGen/MIRFormatter.h b/llvm/include/llvm/CodeGen/MIRFormatter.h new file mode 100644 index 0000000000000..e57c32c5ae614 --- /dev/null +++ b/llvm/include/llvm/CodeGen/MIRFormatter.h @@ -0,0 +1,83 @@ +//===-- llvm/CodeGen/MIRFormatter.h -----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the MIRFormatter class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MIRFORMATTER_H +#define LLVM_CODEGEN_MIRFORMATTER_H + +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/PseudoSourceValue.h" + +namespace llvm { + +struct PerFunctionMIParsingState; +struct SlotMapping; + +/// MIRFormatter - Interface to format a MIR operand based on the target +class MIRFormatter { +public: + typedef function_ref<bool(StringRef::iterator Loc, const Twine &)> + ErrorCallbackType; + + MIRFormatter() {} + virtual ~MIRFormatter() = default; + + /// Implement target specific printing for a machine operand immediate value, so + /// that we can have a more meaningful mnemonic than a 64-bit integer. Passing + /// None to OpIdx means the index is unknown. + virtual void printImm(raw_ostream &OS, const MachineInstr &MI, + Optional<unsigned> OpIdx, int64_t Imm) const { + OS << Imm; + } + + /// Implement target specific parsing of immediate mnemonics. The mnemonic is a + /// dot-separated string. + virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const { + llvm_unreachable("target did not implement parsing MIR immediate mnemonic"); + } + + /// Implement target specific printing of target custom pseudo source value. + /// Default implementation is not necessarily the correct MIR serialization + /// format. + virtual void + printCustomPseudoSourceValue(raw_ostream &OS, ModuleSlotTracker &MST, + const PseudoSourceValue &PSV) const { + PSV.printCustom(OS); + } + + /// Implement target specific parsing of target custom pseudo source value. + virtual bool parseCustomPseudoSourceValue( + StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, + const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const { + llvm_unreachable( + "target did not implement parsing MIR custom pseudo source value"); + } + + /// Helper function to print an IR value in MIR serialization format, which is + /// useful for a target specific printer, e.g. for printing an IR value in a + /// custom pseudo source value. + static void printIRValue(raw_ostream &OS, const Value &V, + ModuleSlotTracker &MST); + + /// Helper function to parse an IR value from MIR serialization format, which + /// is useful for a target specific parser, e.g. for parsing an IR value for a + /// custom pseudo source value. + static bool parseIRValue(StringRef Src, MachineFunction &MF, + PerFunctionMIParsingState &PFS, const Value *&V, + ErrorCallbackType ErrorCallback); +}; + +} // end namespace llvm + +#endif diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h index 4e32a04551c1c..8ca665b23b280 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h +++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h @@ -171,12 +171,16 @@ struct PerFunctionMIParsingState { DenseMap<unsigned, unsigned> ConstantPoolSlots; DenseMap<unsigned, unsigned> JumpTableSlots; + /// Maps from slot numbers to the function's unnamed values.
+ DenseMap<unsigned, const Value *> Slots2Values; + PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM, const SlotMapping &IRSlots, PerTargetMIParsingState &Target); VRegInfo &getVRegInfo(unsigned Num); VRegInfo &getVRegInfoNamed(StringRef RegName); + const Value *getIRValue(unsigned Slot); }; /// Parse the machine basic block definitions, and skip the machine diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h index 7ee700c62a25c..b0243646b06c7 100644 --- a/llvm/include/llvm/CodeGen/MachineMemOperand.h +++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h @@ -26,6 +26,7 @@ namespace llvm { class FoldingSetNodeID; class MDNode; +class MIRFormatter; class raw_ostream; class MachineFunction; class ModuleSlotTracker; @@ -295,7 +296,8 @@ class MachineMemOperand { /// @{ void print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl<StringRef> &SSNs, const LLVMContext &Context, - const MachineFrameInfo *MFI, const TargetInstrInfo *TII) const; + const MachineFrameInfo *MFI, const TargetInstrInfo *TII, + const MIRFormatter *MIRF) const; /// @} friend bool operator==(const MachineMemOperand &LHS, diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index df914dc2d85e8..4222c03b023a9 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -278,6 +278,9 @@ class MachineOperand { /// More complex way of printing a MachineOperand. /// \param TypeToPrint specifies the generic type to be printed on uses and /// defs. It can be determined using MachineInstr::getTypeToPrint. + /// \param OpIdx - specifies the index of the operand in the machine instruction. + /// This will be used by the target dependent MIR formatter. It can be None if + /// the index is unknown, e.g. when called from dump(). /// \param PrintDef - whether we want to print `def` on an operand which /// isDef. Sometimes, if the operand is printed before '=', we don't print /// `def`. @@ -294,8 +297,9 @@ class MachineOperand { /// information from it's parent. /// \param IntrinsicInfo - same as \p TRI. void print(raw_ostream &os, ModuleSlotTracker &MST, LLT TypeToPrint, - bool PrintDef, bool IsStandalone, bool ShouldPrintRegisterTies, - unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, + Optional<unsigned> OpIdx, bool PrintDef, bool IsStandalone, + bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, + const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const; /// Same as print(os, TRI, IntrinsicInfo), but allows to specify the low-level diff --git a/llvm/include/llvm/CodeGen/PseudoSourceValue.h b/llvm/include/llvm/CodeGen/PseudoSourceValue.h index 4b3cc9145a134..593a865ea5458 100644 --- a/llvm/include/llvm/CodeGen/PseudoSourceValue.h +++ b/llvm/include/llvm/CodeGen/PseudoSourceValue.h @@ -22,6 +22,7 @@ namespace llvm { class MachineFrameInfo; class MachineMemOperand; +class MIRFormatter; class raw_ostream; class TargetInstrInfo; @@ -52,6 +53,7 @@ class PseudoSourceValue { const PseudoSourceValue* PSV); friend class MachineMemOperand; // For printCustom(). + friend class MIRFormatter; // For printCustom(). /// Implement printing for PseudoSourceValue. This is called from /// Value::print or Value's operator<<.
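Because the change above only declares the hooks, here is a brief sketch of how a backend might implement the print/parse pair; the target namespace, opcode, and mnemonic are invented for illustration, and only the MIRFormatter interface itself comes from this patch:

  class MyTargetMIRFormatter : public MIRFormatter {
  public:
    // Print a symbolic mnemonic for a known immediate instead of the raw value.
    void printImm(raw_ostream &OS, const MachineInstr &MI,
                  Optional<unsigned> OpIdx, int64_t Imm) const override {
      if (MI.getOpcode() == MyTarget::CTRL_LOAD && Imm == 0x10) // hypothetical
        OS << "ctrl.status";
      else
        MIRFormatter::printImm(OS, MI, OpIdx, Imm); // default prints the integer
    }

    // The inverse mapping, used when the MIR parser sees the mnemonic.
    bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
                          StringRef Src, int64_t &Imm,
                          ErrorCallbackType ErrorCallback) const override {
      if (OpCode == MyTarget::CTRL_LOAD && Src == "ctrl.status") {
        Imm = 0x10;
        return false; // MIParser convention: false means success
      }
      return ErrorCallback(Src.begin(), "unknown target immediate mnemonic");
    }
  };

A target would then expose an instance through the TargetMachine::getMIRFormatter() hook added elsewhere in this patch, so that MIR printing and parsing stay in sync.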
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index a0beee36c7484..c7d4c4d7e5d44 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -309,13 +309,6 @@ class TargetFrameLowering { RegScavenger *RS = nullptr) const { } - /// processFunctionBeforeFrameIndicesReplaced - This method is called - /// immediately before MO_FrameIndex operands are eliminated, but after the - /// frame is finalized. This method is optional. - virtual void - processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, - RegScavenger *RS = nullptr) const {} - virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const { report_fatal_error("WinEH not implemented for this target"); } diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 4b4cea30b2ba3..e410d1c4806d3 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/None.h" #include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFunction.h" @@ -1807,6 +1808,7 @@ class TargetInstrInfo : public MCInstrInfo { Register Reg) const; private: + mutable std::unique_ptr Formatter; unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; unsigned CatchRetOpcode; unsigned ReturnOpcode; diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index 35d40c67b44f2..518ad7079225e 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -773,6 +773,9 @@ class Neon_Dot_Intrinsic def int_arm_neon_udot : Neon_Dot_Intrinsic; def int_arm_neon_sdot : Neon_Dot_Intrinsic; +def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; + def int_arm_mve_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_mve_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_mve_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>; @@ -881,7 +884,7 @@ defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty], multiclass MVEPredicated rets, list params, LLVMType pred = llvm_anyvector_ty, - list props = []> { + list props = [IntrNoMem]> { def "": Intrinsic; def _predicated: Intrinsic; } @@ -895,7 +898,7 @@ multiclass MVEPredicatedM rets, list params, } defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty], - [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty, [IntrNoMem]>; + [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty>; defm int_arm_mve_vldr_gather_base: MVEPredicated< [llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty], @@ -1033,7 +1036,7 @@ def int_arm_mve_vmull_poly: Intrinsic< multiclass MVEMXPredicated rets, list flags, list params, LLVMType inactive, LLVMType predicate, - list props = []> { + list props = [IntrNoMem]> { def "": Intrinsic; def _predicated: Intrinsic; @@ -1047,7 +1050,7 @@ multiclass MVEMXPredicated rets, list flags, defm int_arm_mve_vcaddq : MVEMXPredicated< [llvm_anyvector_ty], [llvm_i32_ty, llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - LLVMMatchType<0>, llvm_anyvector_ty, [IntrNoMem]>; + LLVMMatchType<0>, llvm_anyvector_ty>; // The first operand of the following two intrinsics 
is the rotation angle // (must be a compile-time constant): @@ -1058,12 +1061,12 @@ defm int_arm_mve_vcaddq : MVEMXPredicated< defm int_arm_mve_vcmulq : MVEMXPredicated< [llvm_anyvector_ty], [llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - LLVMMatchType<0>, llvm_anyvector_ty, [IntrNoMem]>; + LLVMMatchType<0>, llvm_anyvector_ty>; defm int_arm_mve_vcmlaq : MVEPredicated< [llvm_anyvector_ty], [llvm_i32_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - llvm_anyvector_ty, [IntrNoMem]>; + llvm_anyvector_ty>; def int_arm_mve_vld2q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>; def int_arm_mve_vld4q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>; @@ -1072,9 +1075,6 @@ def int_arm_mve_vst2q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMat def int_arm_mve_vst4q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem] >; -def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; - // MVE vector absolute difference and accumulate across vector // The first operand is an 'unsigned' flag. The remaining operands are: // * accumulator @@ -1083,8 +1083,7 @@ def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; // * mask (only in predicated versions) defm int_arm_mve_vabav: MVEPredicated< [llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], llvm_anyvector_ty, - [IntrNoMem]>; + [llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], llvm_anyvector_ty>; // The following 3 instrinsics are MVE vector reductions with two vector // operands. @@ -1107,19 +1106,19 @@ defm int_arm_mve_vmldava: MVEPredicated< [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], - llvm_anyvector_ty, [IntrNoMem]>; + llvm_anyvector_ty>; // Version with 64-bit result, vml{a,s}ldav[a][x] defm int_arm_mve_vmlldava: MVEPredicated< [llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], - llvm_anyvector_ty, [IntrNoMem]>; + llvm_anyvector_ty>; // Version with 72-bit rounded result, vrml{a,s}ldavh[a][x] defm int_arm_mve_vrmlldavha: MVEPredicated< [llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], - llvm_anyvector_ty, [IntrNoMem]>; + llvm_anyvector_ty>; } // end TargetPrefix diff --git a/llvm/include/llvm/IR/ValueHandle.h b/llvm/include/llvm/IR/ValueHandle.h index 11ac2a8608d81..50b7701f67162 100644 --- a/llvm/include/llvm/IR/ValueHandle.h +++ b/llvm/include/llvm/IR/ValueHandle.h @@ -171,6 +171,25 @@ template <> struct simplify_type<const WeakVH> { static SimpleType getSimplifiedValue(const WeakVH &WVH) { return WVH; } }; +// Specialize DenseMapInfo to allow WeakVH to participate in DenseMap. +template <> struct DenseMapInfo<WeakVH> { + static inline WeakVH getEmptyKey() { + return WeakVH(DenseMapInfo<Value *>::getEmptyKey()); + } + + static inline WeakVH getTombstoneKey() { + return WeakVH(DenseMapInfo<Value *>::getTombstoneKey()); + } + + static unsigned getHashValue(const WeakVH &Val) { + return DenseMapInfo<Value *>::getHashValue(Val); + } + + static bool isEqual(const WeakVH &LHS, const WeakVH &RHS) { + return DenseMapInfo<Value *>::isEqual(LHS, RHS); + } +}; + /// Value handle that is nullable, but tries to track the Value.
/// /// This is a value handle that tries hard to point to a Value, even across diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h index 004a6f5f6eb80..37b9669cbeed9 100644 --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -732,6 +732,11 @@ inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { return alignTo(Numerator, Denominator) / Denominator; } +/// Returns the integer nearest(Numerator / Denominator). +inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) { + return (Numerator + (Denominator / 2)) / Denominator; +} + /// Returns the largest uint64_t less than or equal to \p Value and is /// \p Skew mod \p Align. \p Align must be non-zero inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { diff --git a/llvm/include/llvm/Support/Memory.h b/llvm/include/llvm/Support/Memory.h index 6f22dd7080cd5..c0454223b2fd1 100644 --- a/llvm/include/llvm/Support/Memory.h +++ b/llvm/include/llvm/Support/Memory.h @@ -57,6 +57,17 @@ namespace sys { MF_WRITE = 0x2000000, MF_EXEC = 0x4000000, MF_RWE_MASK = 0x7000000, + + /// The \p MF_HUGE_HINT flag is used to indicate that the request for + /// a memory block should be satisfied with large pages if possible. + /// This is only a hint and small pages will be used as fallback. + /// + /// The presence or absence of this flag in the returned memory block + /// is (at least currently) *not* a reliable indicator that the memory + /// block will use or will not use large pages. On some systems a request + /// without this flag can be backed by large pages without this flag being + /// set, and on some other systems a request with this flag can fallback + /// to small pages without this flag being cleared. MF_HUGE_HINT = 0x0000001 }; diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 176ae39b17a7c..39422ac3bf8ca 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -33,6 +33,7 @@ class MCInstrInfo; class MCRegisterInfo; class MCSubtargetInfo; class MCSymbol; +class MIRFormatter; class raw_pwrite_stream; class PassManagerBuilder; struct PerFunctionMIParsingState; @@ -94,6 +95,7 @@ class TargetMachine { std::unique_ptr MRI; std::unique_ptr MII; std::unique_ptr STI; + std::unique_ptr MIRF; unsigned RequireStructuredCFG : 1; unsigned O0WantsFastISel : 1; @@ -197,6 +199,10 @@ class TargetMachine { return nullptr; } + /// Return MIR formatter to format/parse MIR operands. Target can override + /// this virtual function and return target specific MIR formatter. + virtual const MIRFormatter *getMIRFormatter() const { return MIRF.get(); } + bool requiresStructuredCFG() const { return RequireStructuredCFG; } void setRequiresStructuredCFG(bool Value) { RequireStructuredCFG = Value; } diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 53236b54ff0f0..70642b201adb3 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -865,6 +865,13 @@ struct Attributor { ToBeChangedToUnreachableInsts.insert(I); } + /// Record that \p II has at least one dead successor block. This information + /// is used, e.g., to replace \p II with a call, after information was + /// manifested. 
+ void registerInvokeWithDeadSuccessor(InvokeInst &II) { + InvokeWithDeadSuccessor.push_back(&II); + } + /// Record that \p I is deleted after information was manifested. This also /// triggers deletion of trivially dead istructions. void deleteAfterManifest(Instruction &I) { ToBeDeletedInsts.insert(&I); } @@ -1174,7 +1181,10 @@ struct Attributor { DenseMap ToBeChangedUses; /// Instructions we replace with `unreachable` insts after manifest is done. - SmallPtrSet ToBeChangedToUnreachableInsts; + SmallDenseSet ToBeChangedToUnreachableInsts; + + /// Invoke instructions with at least a single dead successor block. + SmallVector InvokeWithDeadSuccessor; /// Functions, blocks, and instructions we delete after manifest is done. /// diff --git a/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h b/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h index 7920269b0fb27..233963528595e 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h @@ -15,9 +15,7 @@ namespace llvm { -class Loop; -struct LoopStandardAnalysisResults; -class LPMUpdater; +class Function; /// A simple loop rotation transformation. class LoopUnrollAndJamPass : public PassInfoMixin { @@ -25,8 +23,7 @@ class LoopUnrollAndJamPass : public PassInfoMixin { public: explicit LoopUnrollAndJamPass(int OptLevel = 2) : OptLevel(OptLevel) {} - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, LPMUpdater &U); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; } // end namespace llvm diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index afcca2ab1fa3a..d7510c8991013 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -3996,6 +3996,15 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, return FalseVal; } + // select i1 Cond, i1 true, i1 false --> i1 Cond + assert(Cond->getType()->isIntOrIntVectorTy(1) && + "Select must have bool or bool vector condition"); + assert(TrueVal->getType() == FalseVal->getType() && + "Select must have same types for true/false ops"); + if (Cond->getType() == TrueVal->getType() && + match(TrueVal, m_One()) && match(FalseVal, m_ZeroInt())) + return Cond; + // select ?, X, X -> X if (TrueVal == FalseVal) return TrueVal; diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 21511586ff185..0c35a91f8282e 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -242,6 +242,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("jump-table", MIToken::kw_jump_table) .Case("constant-pool", MIToken::kw_constant_pool) .Case("call-entry", MIToken::kw_call_entry) + .Case("custom", MIToken::kw_custom) .Case("liveout", MIToken::kw_liveout) .Case("address-taken", MIToken::kw_address_taken) .Case("landing-pad", MIToken::kw_landing_pad) diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index 1e2eba91ceb53..af5327cacfea5 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -110,6 +110,7 @@ struct MIToken { kw_jump_table, kw_constant_pool, kw_call_entry, + kw_custom, kw_liveout, kw_address_taken, kw_landing_pad, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 525c70016a0fb..0f2648e2bfac5 100644 --- 
a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -28,6 +28,7 @@ #include "llvm/AsmParser/SlotMapping.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -343,6 +344,37 @@ VRegInfo &PerFunctionMIParsingState::getVRegInfoNamed(StringRef RegName) { return *I.first->second; } +static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST, + DenseMap &Slots2Values) { + int Slot = MST.getLocalSlot(V); + if (Slot == -1) + return; + Slots2Values.insert(std::make_pair(unsigned(Slot), V)); +} + +/// Creates the mapping from slot numbers to function's unnamed IR values. +static void initSlots2Values(const Function &F, + DenseMap &Slots2Values) { + ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); + MST.incorporateFunction(F); + for (const auto &Arg : F.args()) + mapValueToSlot(&Arg, MST, Slots2Values); + for (const auto &BB : F) { + mapValueToSlot(&BB, MST, Slots2Values); + for (const auto &I : BB) + mapValueToSlot(&I, MST, Slots2Values); + } +} + +const Value* PerFunctionMIParsingState::getIRValue(unsigned Slot) { + if (Slots2Values.empty()) + initSlots2Values(MF.getFunction(), Slots2Values); + auto ValueInfo = Slots2Values.find(Slot); + if (ValueInfo == Slots2Values.end()) + return nullptr; + return ValueInfo->second; +} + namespace { /// A wrapper struct around the 'MachineOperand' struct that includes a source @@ -370,8 +402,6 @@ class MIParser { PerFunctionMIParsingState &PFS; /// Maps from slot numbers to function's unnamed basic blocks. DenseMap Slots2BasicBlocks; - /// Maps from slot numbers to function's unnamed values. - DenseMap Slots2Values; public: MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error, @@ -455,9 +485,12 @@ class MIParser { bool parseTargetIndexOperand(MachineOperand &Dest); bool parseCustomRegisterMaskOperand(MachineOperand &Dest); bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); - bool parseMachineOperand(MachineOperand &Dest, + bool parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx); - bool parseMachineOperandAndTargetFlags(MachineOperand &Dest, + bool parseMachineOperandAndTargetFlags(const unsigned OpCode, + const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx); bool parseOffset(int64_t &Offset); bool parseAlignment(unsigned &Alignment); @@ -473,6 +506,9 @@ class MIParser { bool parsePreOrPostInstrSymbol(MCSymbol *&Symbol); bool parseHeapAllocMarker(MDNode *&Node); + bool parseTargetImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, const MIRFormatter &MF); + private: /// Convert the integer literal in the current token into an unsigned integer. 
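Most of the MIParser changes that follow repeat one refactoring pattern: member functions that reported through MIParser::error() become free functions taking an error callback, so the same parsing logic can be reused by MIRFormatter outside the parser. A standalone sketch of the pattern (the names here are invented; the real typedef appears in the next hunk):

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/Twine.h"

    using namespace llvm;

    using ErrorCallbackType =
        function_ref<bool(StringRef::iterator Loc, const Twine &Msg)>;

    // Free helper: reports through the callback rather than a parser member.
    // By this file's convention, returning true signals an error.
    static bool parseWidget(StringRef Src, ErrorCallbackType ErrCB) {
      if (Src.empty())
        return ErrCB(Src.begin(), "expected a widget");
      return false;
    }

    // A member function then forwards its own error() as the callback, e.g.:
    //   return ::parseWidget(Src, [this](StringRef::iterator Loc,
    //                                    const Twine &Msg) -> bool {
    //     return error(Loc, Msg);
    //   });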
/// @@ -551,6 +587,9 @@ bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) { return true; } +typedef function_ref + ErrorCallbackType; + static const char *toString(MIToken::TokenKind TokenKind) { switch (TokenKind) { case MIToken::comma: @@ -912,7 +951,7 @@ bool MIParser::parse(MachineInstr *&MI) { Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) { auto Loc = Token.location(); Optional TiedDefIdx; - if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx)) + if (parseMachineOperandAndTargetFlags(OpCode, Operands.size(), MO, TiedDefIdx)) return true; if (OpCode == TargetOpcode::DBG_VALUE && MO.isReg()) MO.setIsDebug(); @@ -1493,17 +1532,61 @@ bool MIParser::parseImmediateOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, - const Constant *&C) { +bool MIParser::parseTargetImmMnemonic(const unsigned OpCode, + const unsigned OpIdx, + MachineOperand &Dest, + const MIRFormatter &MF) { + assert(Token.is(MIToken::dot)); + auto Loc = Token.location(); // record start position + size_t Len = 1; // for "." + lex(); + + // Handle the case where the mnemonic starts with a number. + if (Token.is(MIToken::IntegerLiteral)) { + Len += Token.range().size(); + lex(); + } + + StringRef Src; + if (Token.is(MIToken::comma)) + Src = StringRef(Loc, Len); + else { + assert(Token.is(MIToken::Identifier)); + Src = StringRef(Loc, Len + Token.stringValue().size()); + } + int64_t Val; + if (MF.parseImmMnemonic(OpCode, OpIdx, Src, Val, + [this](StringRef::iterator Loc, const Twine &Msg) + -> bool { return error(Loc, Msg); })) + return true; + + Dest = MachineOperand::CreateImm(Val); + if (!Token.is(MIToken::comma)) + lex(); + return false; +} + +static bool parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + PerFunctionMIParsingState &PFS, const Constant *&C, + ErrorCallbackType ErrCB) { auto Source = StringValue.str(); // The source has to be null terminated. SMDiagnostic Err; - C = parseConstantValue(Source, Err, *MF.getFunction().getParent(), + C = parseConstantValue(Source, Err, *PFS.MF.getFunction().getParent(), &PFS.IRSlots); if (!C) - return error(Loc + Err.getColumnNo(), Err.getMessage()); + return ErrCB(Loc + Err.getColumnNo(), Err.getMessage()); return false; } +bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + const Constant *&C) { + return ::parseIRConstant( + Loc, StringValue, PFS, C, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) { if (parseIRConstant(Loc, StringRef(Loc, Token.range().end() - Loc), C)) return true; @@ -1636,27 +1719,52 @@ bool MIParser::parseFPImmediateOperand(MachineOperand &Dest) { return false; } -bool MIParser::getUnsigned(unsigned &Result) { +static bool getHexUint(const MIToken &Token, APInt &Result) { + assert(Token.is(MIToken::HexLiteral)); + StringRef S = Token.range(); + assert(S[0] == '0' && tolower(S[1]) == 'x'); + // This could be a floating point literal with a special prefix. + if (!isxdigit(S[2])) + return true; + StringRef V = S.substr(2); + APInt A(V.size()*4, V, 16); + + // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make + // sure that isn't the case before constructing the result. + unsigned NumBits = (A == 0) ?
32 : A.getActiveBits(); + Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); + return false; +} + +bool getUnsigned(const MIToken &Token, unsigned &Result, + ErrorCallbackType ErrCB) { if (Token.hasIntegerValue()) { const uint64_t Limit = uint64_t(std::numeric_limits::max()) + 1; uint64_t Val64 = Token.integerValue().getLimitedValue(Limit); if (Val64 == Limit) - return error("expected 32-bit integer (too large)"); + return ErrCB(Token.location(), "expected 32-bit integer (too large)"); Result = Val64; return false; } if (Token.is(MIToken::HexLiteral)) { APInt A; - if (getHexUint(A)) + if (getHexUint(Token, A)) return true; if (A.getBitWidth() > 32) - return error("expected 32-bit integer (too large)"); + return ErrCB(Token.location(), "expected 32-bit integer (too large)"); Result = A.getZExtValue(); return false; } return true; } +bool MIParser::getUnsigned(unsigned &Result) { + return ::getUnsigned( + Token, Result, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) { assert(Token.is(MIToken::MachineBasicBlock) || Token.is(MIToken::MachineBasicBlockLabel)); @@ -1736,23 +1844,25 @@ bool MIParser::parseFixedStackObjectOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseGlobalValue(GlobalValue *&GV) { +static bool parseGlobalValue(const MIToken &Token, + PerFunctionMIParsingState &PFS, GlobalValue *&GV, + ErrorCallbackType ErrCB) { switch (Token.kind()) { case MIToken::NamedGlobalValue: { - const Module *M = MF.getFunction().getParent(); + const Module *M = PFS.MF.getFunction().getParent(); GV = M->getNamedValue(Token.stringValue()); if (!GV) - return error(Twine("use of undefined global value '") + Token.range() + - "'"); + return ErrCB(Token.location(), Twine("use of undefined global value '") + + Token.range() + "'"); break; } case MIToken::GlobalValue: { unsigned GVIdx; - if (getUnsigned(GVIdx)) + if (getUnsigned(Token, GVIdx, ErrCB)) return true; if (GVIdx >= PFS.IRSlots.GlobalValues.size()) - return error(Twine("use of undefined global value '@") + Twine(GVIdx) + - "'"); + return ErrCB(Token.location(), Twine("use of undefined global value '@") + + Twine(GVIdx) + "'"); GV = PFS.IRSlots.GlobalValues[GVIdx]; break; } @@ -1762,6 +1872,14 @@ bool MIParser::parseGlobalValue(GlobalValue *&GV) { return false; } +bool MIParser::parseGlobalValue(GlobalValue *&GV) { + return ::parseGlobalValue( + Token, PFS, GV, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { GlobalValue *GV = nullptr; if (parseGlobalValue(GV)) @@ -2410,7 +2528,8 @@ bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseMachineOperand(MachineOperand &Dest, +bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx) { switch (Token.kind()) { case MIToken::kw_implicit: @@ -2499,6 +2618,12 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, return parseCustomRegisterMaskOperand(Dest); } else return parseTypedImmediateOperand(Dest); + case MIToken::dot: { + if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { + return parseTargetImmMnemonic(OpCode, OpIdx, Dest, *Formatter); + } + LLVM_FALLTHROUGH; + } default: // FIXME: Parse the MCSymbol machine operand. 
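For reference, a target adopting the new MIToken::dot path above would override MIRFormatter roughly as follows. This is a hypothetical target class with invented ".reuse"/".noreuse" mnemonics, and the signature is inferred from the call site in parseTargetImmMnemonic above, so treat it as a sketch rather than the definitive API:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/CodeGen/MIRFormatter.h"

    using namespace llvm;

    struct MyTargetMIRFormatter : public MIRFormatter {
      // Map an operand spelled ".reuse"/".noreuse" in MIR back to its
      // immediate encoding; printImm would perform the inverse mapping.
      bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
                            StringRef Src, int64_t &Imm,
                            ErrorCallbackType ErrorCallback) const override {
        if (Src == ".reuse")
          Imm = 1;
        else if (Src == ".noreuse")
          Imm = 0;
        else
          return ErrorCallback(Src.begin(),
                               "unknown target immediate mnemonic");
        return false; // false == success, matching the parser convention
      }
    };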
return error("expected a machine operand"); @@ -2507,7 +2632,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, } bool MIParser::parseMachineOperandAndTargetFlags( - MachineOperand &Dest, Optional &TiedDefIdx) { + const unsigned OpCode, const unsigned OpIdx, MachineOperand &Dest, + Optional &TiedDefIdx) { unsigned TF = 0; bool HasTargetFlags = false; if (Token.is(MIToken::kw_target_flags)) { @@ -2539,7 +2665,7 @@ bool MIParser::parseMachineOperandAndTargetFlags( return true; } auto Loc = Token.location(); - if (parseMachineOperand(Dest, TiedDefIdx)) + if (parseMachineOperand(OpCode, OpIdx, Dest, TiedDefIdx)) return true; if (!HasTargetFlags) return false; @@ -2600,30 +2726,31 @@ bool MIParser::parseOperandsOffset(MachineOperand &Op) { return false; } -bool MIParser::parseIRValue(const Value *&V) { +static bool parseIRValue(const MIToken &Token, PerFunctionMIParsingState &PFS, + const Value *&V, ErrorCallbackType ErrCB) { switch (Token.kind()) { case MIToken::NamedIRValue: { - V = MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); + V = PFS.MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); break; } case MIToken::IRValue: { unsigned SlotNumber = 0; - if (getUnsigned(SlotNumber)) + if (getUnsigned(Token, SlotNumber, ErrCB)) return true; - V = getIRValue(SlotNumber); + V = PFS.getIRValue(SlotNumber); break; } case MIToken::NamedGlobalValue: case MIToken::GlobalValue: { GlobalValue *GV = nullptr; - if (parseGlobalValue(GV)) + if (parseGlobalValue(Token, PFS, GV, ErrCB)) return true; V = GV; break; } case MIToken::QuotedIRValue: { const Constant *C = nullptr; - if (parseIRConstant(Token.location(), Token.stringValue(), C)) + if (parseIRConstant(Token.location(), Token.stringValue(), PFS, C, ErrCB)) return true; V = C; break; @@ -2632,10 +2759,17 @@ bool MIParser::parseIRValue(const Value *&V) { llvm_unreachable("The current token should be an IR block reference"); } if (!V) - return error(Twine("use of undefined IR value '") + Token.range() + "'"); + return ErrCB(Token.location(), Twine("use of undefined IR value '") + Token.range() + "'"); return false; } +bool MIParser::parseIRValue(const Value *&V) { + return ::parseIRValue( + Token, PFS, V, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::getUint64(uint64_t &Result) { if (Token.hasIntegerValue()) { if (Token.integerValue().getActiveBits() > 64) @@ -2656,20 +2790,7 @@ bool MIParser::getUint64(uint64_t &Result) { } bool MIParser::getHexUint(APInt &Result) { - assert(Token.is(MIToken::HexLiteral)); - StringRef S = Token.range(); - assert(S[0] == '0' && tolower(S[1]) == 'x'); - // This could be a floating point literal with a special prefix. - if (!isxdigit(S[2])) - return true; - StringRef V = S.substr(2); - APInt A(V.size()*4, V, 16); - - // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make - // sure it isn't the case before constructing result. - unsigned NumBits = (A == 0) ? 
32 : A.getActiveBits(); - Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); - return false; + return ::getHexUint(Token, Result); } bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) { @@ -2756,6 +2877,19 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) { "expected a global value or an external symbol after 'call-entry'"); } break; + case MIToken::kw_custom: { + lex(); + if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { + if (Formatter->parseCustomPseudoSourceValue( + Token.stringValue(), MF, PFS, PSV, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + })) + return true; + } else + return error("unable to parse target custom pseudo source value"); + break; + } default: llvm_unreachable("The current token should be pseudo source value"); } @@ -2767,7 +2901,7 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) || Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) || Token.is(MIToken::FixedStackObject) || Token.is(MIToken::StackObject) || - Token.is(MIToken::kw_call_entry)) { + Token.is(MIToken::kw_call_entry) || Token.is(MIToken::kw_custom)) { const PseudoSourceValue *PSV = nullptr; if (parseMemoryPseudoSourceValue(PSV)) return true; @@ -3018,35 +3152,8 @@ const BasicBlock *MIParser::getIRBlock(unsigned Slot, const Function &F) { return getIRBlockFromSlot(Slot, CustomSlots2BasicBlocks); } -static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST, - DenseMap &Slots2Values) { - int Slot = MST.getLocalSlot(V); - if (Slot == -1) - return; - Slots2Values.insert(std::make_pair(unsigned(Slot), V)); -} - -/// Creates the mapping from slot numbers to function's unnamed IR values. 
-static void initSlots2Values(const Function &F, - DenseMap &Slots2Values) { - ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); - MST.incorporateFunction(F); - for (const auto &Arg : F.args()) - mapValueToSlot(&Arg, MST, Slots2Values); - for (const auto &BB : F) { - mapValueToSlot(&BB, MST, Slots2Values); - for (const auto &I : BB) - mapValueToSlot(&I, MST, Slots2Values); - } -} - const Value *MIParser::getIRValue(unsigned Slot) { - if (Slots2Values.empty()) - initSlots2Values(MF.getFunction(), Slots2Values); - auto ValueInfo = Slots2Values.find(Slot); - if (ValueInfo == Slots2Values.end()) - return nullptr; - return ValueInfo->second; + return PFS.getIRValue(Slot); } MCSymbol *MIParser::getOrCreateMCSymbol(StringRef Name) { @@ -3111,3 +3218,15 @@ bool llvm::parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, StringRef Src, SMDiagnostic &Error) { return MIParser(PFS, Error, Src).parseStandaloneMDNode(Node); } + +bool MIRFormatter::parseIRValue(StringRef Src, MachineFunction &MF, + PerFunctionMIParsingState &PFS, const Value *&V, + ErrorCallbackType ErrorCallback) { + MIToken Token; + Src = lexMIToken(Src, Token, [&](StringRef::iterator Loc, const Twine &Msg) { + ErrorCallback(Loc, Msg); + }); + V = nullptr; + + return ::parseIRValue(Token, PFS, V, ErrorCallback); +} diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index b06e34a809fca..9d9c12a95918c 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -709,6 +709,7 @@ void MIPrinter::print(const MachineInstr &MI) { const auto *TRI = SubTarget.getRegisterInfo(); assert(TRI && "Expected target register info"); const auto *TII = SubTarget.getInstrInfo(); + const auto *MIRF = MF->getTarget().getMIRFormatter(); assert(TII && "Expected target instruction info"); if (MI.isCFIInstruction()) assert(MI.getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -807,7 +808,7 @@ void MIPrinter::print(const MachineInstr &MI) { for (const auto *Op : MI.memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, Context, &MFI, TII); + Op->print(OS, MST, SSNs, Context, &MFI, TII, MIRF); NeedComma = true; } } @@ -856,7 +857,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef()) TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx); const TargetIntrinsicInfo *TII = MI.getMF()->getTarget().getIntrinsicInfo(); - Op.print(OS, MST, TypeToPrint, PrintDef, /*IsStandalone=*/false, + Op.print(OS, MST, TypeToPrint, OpIdx, PrintDef, /*IsStandalone=*/false, ShouldPrintRegisterTies, TiedOperandIdx, TRI, TII); break; } @@ -874,6 +875,28 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, } } +void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V, + ModuleSlotTracker &MST) { + if (isa(V)) { + V.printAsOperand(OS, /*PrintType=*/false, MST); + return; + } + if (isa(V)) { + // Machine memory operands can load/store to/from constant value pointers. + OS << '`'; + V.printAsOperand(OS, /*PrintType=*/true, MST); + OS << '`'; + return; + } + OS << "%ir."; + if (V.hasName()) { + printLLVMNameWithoutPrefix(OS, V.getName()); + return; + } + int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; + MachineOperand::printIRSlotNumber(OS, Slot); +} + void llvm::printMIR(raw_ostream &OS, const Module &M) { yaml::Output Out(OS); Out << const_cast(M); diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 16ae732169a48..177fef80e2e68 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -89,13 +89,15 @@ static void tryToGetTargetInfo(const MachineInstr &MI, const TargetRegisterInfo *&TRI, const MachineRegisterInfo *&MRI, const TargetIntrinsicInfo *&IntrinsicInfo, - const TargetInstrInfo *&TII) { + const TargetInstrInfo *&TII, + const MIRFormatter *&MIRF) { if (const MachineFunction *MF = getMFIfAvailable(MI)) { TRI = MF->getSubtarget().getRegisterInfo(); MRI = &MF->getRegInfo(); IntrinsicInfo = MF->getTarget().getIntrinsicInfo(); TII = MF->getSubtarget().getInstrInfo(); + MIRF = MF->getTarget().getMIRFormatter(); } } @@ -1477,7 +1479,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const TargetRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; const TargetIntrinsicInfo *IntrinsicInfo = nullptr; - tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII); + const MIRFormatter *MIRF = nullptr; + tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII, MIRF); if (isCFIInstruction()) assert(getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -1506,7 +1509,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, LLT TypeToPrint = MRI ? getTypeToPrint(StartOp, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(StartOp); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/false, IsStandalone, + MO.print(OS, MST, TypeToPrint, StartOp, /*PrintDef=*/false, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); ++StartOp; } @@ -1561,7 +1564,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const unsigned OpIdx = InlineAsm::MIOp_AsmString; LLT TypeToPrint = MRI ? getTypeToPrint(OpIdx, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(OpIdx); - getOperand(OpIdx).print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + getOperand(OpIdx).print(OS, MST, TypeToPrint, OpIdx, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); @@ -1600,7 +1603,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (isDebugLabel() && MO.isMetadata()) { @@ -1611,7 +1614,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? 
getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (i == AsmDescOp && MO.isImm()) { @@ -1678,7 +1681,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, if (MO.isImm() && isOperandSubregIdx(i)) MachineOperand::printSubRegIdx(OS, MO.getImm(), TRI); else - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } @@ -1737,7 +1740,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, for (const MachineMemOperand *Op : memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, *Context, MFI, TII); + Op->print(OS, MST, SSNs, *Context, MFI, TII, MIRF); NeedComma = true; } } diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 8b19501ec3cf1..5dd98467ba663 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -458,28 +459,6 @@ static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB, OS << ""; } -static void printIRValueReference(raw_ostream &OS, const Value &V, - ModuleSlotTracker &MST) { - if (isa(V)) { - V.printAsOperand(OS, /*PrintType=*/false, MST); - return; - } - if (isa(V)) { - // Machine memory operands can load/store to/from constant value pointers. - OS << '`'; - V.printAsOperand(OS, /*PrintType=*/true, MST); - OS << '`'; - return; - } - OS << "%ir."; - if (V.hasName()) { - printLLVMNameWithoutPrefix(OS, V.getName()); - return; - } - int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; - MachineOperand::printIRSlotNumber(OS, Slot); -} - static void printSyncScope(raw_ostream &OS, const LLVMContext &Context, SyncScope::ID SSID, SmallVectorImpl &SSNs) { @@ -734,14 +713,15 @@ void MachineOperand::print(raw_ostream &OS, LLT TypeToPrint, const TargetIntrinsicInfo *IntrinsicInfo) const { tryToGetTargetInfo(*this, TRI, IntrinsicInfo); ModuleSlotTracker DummyMST(nullptr); - print(OS, DummyMST, TypeToPrint, /*PrintDef=*/false, /*IsStandalone=*/true, + print(OS, DummyMST, TypeToPrint, None, /*PrintDef=*/false, + /*IsStandalone=*/true, /*ShouldPrintRegisterTies=*/true, /*TiedOperandIdx=*/0, TRI, IntrinsicInfo); } void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, - LLT TypeToPrint, bool PrintDef, bool IsStandalone, - bool ShouldPrintRegisterTies, + LLT TypeToPrint, Optional OpIdx, bool PrintDef, + bool IsStandalone, bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const { @@ -802,9 +782,16 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << '(' << TypeToPrint << ')'; break; } - case MachineOperand::MO_Immediate: - OS << getImm(); + case MachineOperand::MO_Immediate: { + const MIRFormatter *Formatter = nullptr; + if (const MachineFunction *MF = getMFIfAvailable(*this)) + Formatter = MF->getTarget().getMIRFormatter(); + if (Formatter) + Formatter->printImm(OS, *getParent(), OpIdx, getImm()); + else + OS << getImm(); break; + } case MachineOperand::MO_CImmediate: getCImm()->printAsOperand(OS, /*PrintType=*/true, MST); break; @@ -1070,7 +1057,8 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, const MachineFrameInfo *MFI, - const TargetInstrInfo *TII) const { + const TargetInstrInfo *TII, + const MIRFormatter* MIRF) const { OS << '('; if (isVolatile()) OS << "volatile "; @@ -1111,7 +1099,7 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, if (const Value *Val = getValue()) { OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into "); - printIRValueReference(OS, *Val, MST); + MIRFormatter::printIRValue(OS, *Val, MST); } else if (const PseudoSourceValue *PVal = getPseudoValue()) { OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into "); assert(PVal && "Expected a pseudo source value"); @@ -1144,15 +1132,20 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, printLLVMNameWithoutPrefix( OS, cast(PVal)->getSymbol()); break; - default: + default: { // FIXME: This is not necessarily the correct MIR serialization format for // a custom pseudo source value, but at least it allows // -print-machineinstrs to work on a target with custom pseudo source // values. - OS << "custom "; - PVal->printCustom(OS); + OS << "custom \""; + if (MIRF) + MIRF->printCustomPseudoSourceValue(OS, MST, *PVal); + else + PVal->printCustom(OS); + OS << '\"'; break; } + } } MachineOperand::printOperandOffset(OS, getOffset()); if (getBaseAlignment() != getSize()) diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index ca57e51268e88..b2534c2e53d4a 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2312,6 +2312,32 @@ void MachineVerifier::visitMachineFunctionAfter() { if (LiveInts) verifyLiveIntervals(); + // Check live-in list of each MBB. If a register is live into MBB, check + // that the register is in regsLiveOut of each predecessor block. 
Since + // this must come from a definition in the predecessor or its live-in + // list, this will catch a live-through case where the predecessor does not + // have the register in its live-in list. This currently only checks + // registers that have no aliases, are not allocatable and are not + // reserved, which could mean a condition code register for instance. + if (MRI->tracksLiveness()) + for (const auto &MBB : *MF) + for (MachineBasicBlock::RegisterMaskPair P : MBB.liveins()) { + MCPhysReg LiveInReg = P.PhysReg; + bool hasAliases = MCRegAliasIterator(LiveInReg, TRI, false).isValid(); + if (hasAliases || isAllocatable(LiveInReg) || isReserved(LiveInReg)) + continue; + for (const MachineBasicBlock *Pred : MBB.predecessors()) { + BBInfo &PInfo = MBBInfoMap[Pred]; + if (!PInfo.regsLiveOut.count(LiveInReg)) { + report("Live in register not found to be live out from predecessor.", + &MBB); + errs() << TRI->getName(LiveInReg) + << " not found to be live out from " + << printMBBReference(*Pred) << "\n"; + } + } + } + for (auto CSInfo : MF->getCallSitesInfo()) if (!CSInfo.first->isCall()) report("Call site info referencing instruction that is not call", MF); diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index d583643ac68f7..3909b57172814 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -259,10 +259,6 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) { for (auto &I : EntryDbgValues) I.first->insert(I.first->begin(), I.second.begin(), I.second.end()); - // Allow the target machine to make final modifications to the function - // before the frame layout is finalized. - TFI->processFunctionBeforeFrameIndicesReplaced(MF, RS); - // Replace all MO_FrameIndex operands with physical register references // and actual offsets. // diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cfc4671eaa0e4..6030c95742015 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12639,6 +12639,15 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { } } + // fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z)) + // fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z)) + if (!TLI.isFNegFree(VT) && + TLI.isNegatibleForFree(SDValue(N, 0), DAG, LegalOperations, + ForCodeSize) == 2) + return DAG.getNode(ISD::FNEG, DL, VT, + TLI.getNegatedExpression(SDValue(N, 0), DAG, + LegalOperations, ForCodeSize), + Flags); return SDValue(); } @@ -18585,8 +18594,22 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { if (ConcatSrcNumElts == ExtNumElts) return V.getOperand(ConcatOpIdx); - // TODO: Handle the case where the concat operands are larger than the - // result of this extract by extracting directly from a concat op. + // If the concatenated source vectors are a whole multiple of the length of + // this extract, then extract a fraction of one of those source vectors + // directly from a concat operand.
Example: + // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14 --> + // v2i8 extract_subvec v8i8 Y, 6 + if (ConcatSrcNumElts % ExtNumElts == 0) { + SDLoc DL(N); + unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts; + assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts && + "Trying to extract from >1 concat operand?"); + assert(NewExtIdx % ExtNumElts == 0 && + "Extract index is not a multiple of the input vector length."); + SDValue NewIndexC = DAG.getIntPtrConstant(NewExtIdx, DL); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, + V.getOperand(ConcatOpIdx), NewIndexC); + } } V = peekThroughBitcasts(V); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 6fd71393bf38c..cbdcb93e60c90 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -480,7 +480,8 @@ static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO, if (MF) MST.incorporateFunction(MF->getFunction()); SmallVector SSNs; - MMO.print(OS, MST, SSNs, Ctx, MFI, TII); + MMO.print(OS, MST, SSNs, Ctx, MFI, TII, + MF ? MF->getTarget().getMIRFormatter() : nullptr); } static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO, diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index 8aed9ab653a16..0c5f9a9c54ec6 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -79,9 +79,9 @@ DWARFLocationInterpreter::Interpret(const DWARFLocationEntry &E) { } case dwarf::DW_LLE_offset_pair: { if (!Base) { - return createStringError( - inconvertibleErrorCode(), - "Unable to resolve DW_LLE_offset_pair: base address unknown"); + return createStringError(inconvertibleErrorCode(), + "Unable to resolve location list offset pair: " + "Base address not defined"); } DWARFAddressRange Range{Base->Address + E.Value0, Base->Address + E.Value1, Base->SectionIndex}; diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 63b98f26ba193..04e34a90a9bcf 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -147,9 +147,9 @@ LLVMAttributeRef LLVMCreateEnumAttribute(LLVMContextRef C, unsigned KindID, // After r362128, byval attributes need to have a type attribute. Provide a // NULL one until a proper API is added for this.
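A standalone arithmetic check of the EXTRACT_SUBVECTOR fold above, using the numbers from its own comment (plain C++, no DAG types involved):

    #include <cassert>

    int main() {
      // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14
      unsigned ExtNumElts = 2, ConcatSrcNumElts = 8, ExtIdx = 14;
      unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;             // 1, i.e. Y
      unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts; // 14 - 8 = 6
      assert(ConcatSrcNumElts % ExtNumElts == 0);          // the fold's guard
      assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts);  // first assert above
      assert(NewExtIdx % ExtNumElts == 0);                 // second assert above
      // Result: v2i8 extract_subvec v8i8 Y, 6
      return NewExtIdx == 6 ? 0 : 1;
    }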
return wrap(Attribute::getWithByValType(Ctx, NULL)); - } else { - return wrap(Attribute::get(Ctx, AttrKind, Val)); } + + return wrap(Attribute::get(Ctx, AttrKind, Val)); } unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A) { diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 50648ba17945b..d232946af2942 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4779,7 +4779,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { case Intrinsic::experimental_constrained_fcmp: case Intrinsic::experimental_constrained_fcmps: { - auto Pred = dyn_cast(&FPI)->getPredicate(); + auto Pred = cast(&FPI)->getPredicate(); Assert(CmpInst::isFPPredicate(Pred), "invalid predicate for constrained FP comparison intrinsic", &FPI); break; diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 0c4eb953aa4e6..dc8132b627a66 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -3130,7 +3130,9 @@ bool AsmParser::parseRealValue(const fltSemantics &Semantics, APInt &Res) { Value = APFloat::getNaN(Semantics, false, ~0); else return TokError("invalid floating point literal"); - } else if (!Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven)) + } else if (errorToBool( + Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven) + .takeError())) return TokError("invalid floating point literal"); if (IsNeg) Value.changeSign(); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 64d748f94f144..646eb7d26cbdd 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -970,8 +970,7 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( // across the loop nests. // We do UnrollAndJam in a separate LPM to ensure it happens before unroll if (EnableUnrollAndJam && PTO.LoopUnrolling) { - OptimizePM.addPass( - createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level))); + OptimizePM.addPass(LoopUnrollAndJamPass(Level)); } OptimizePM.addPass(LoopUnrollPass( LoopUnrollOptions(Level, /*OnlyWhenForced=*/!PTO.LoopUnrolling, diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 66b38872b386e..3efb57cd35890 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -235,6 +235,7 @@ FUNCTION_PASS("spec-phis", SpeculateAroundPHIsPass()) FUNCTION_PASS("sroa", SROA()) FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) +FUNCTION_PASS("unroll-and-jam", LoopUnrollAndJamPass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) @@ -307,7 +308,6 @@ LOOP_PASS("simplify-cfg", LoopSimplifyCFGPass()) LOOP_PASS("strength-reduce", LoopStrengthReducePass()) LOOP_PASS("indvars", IndVarSimplifyPass()) LOOP_PASS("irce", IRCEPass()) -LOOP_PASS("unroll-and-jam", LoopUnrollAndJamPass()) LOOP_PASS("unroll-full", LoopFullUnrollPass()) LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs())) LOOP_PASS("print", DDGAnalysisPrinterPass(dbgs())) diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index f8a217d3535de..050c37baefb87 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -4518,9 +4518,8 @@ hash_code hash_value(const APFloat &Arg) { APFloat::APFloat(const fltSemantics &Semantics, StringRef S) : APFloat(Semantics) { auto StatusOrErr = convertFromString(S, 
rmNearestTiesToEven); - if (!StatusOrErr) { - assert(false && "Invalid floating point representation"); - } + assert(StatusOrErr && "Invalid floating point representation"); + consumeError(StatusOrErr.takeError()); } APFloat::opStatus APFloat::convert(const fltSemantics &ToSemantics, diff --git a/llvm/lib/Support/ARMTargetParser.cpp b/llvm/lib/Support/ARMTargetParser.cpp index ce5daa7fe58c0..f2c22fd93c8b8 100644 --- a/llvm/lib/Support/ARMTargetParser.cpp +++ b/llvm/lib/Support/ARMTargetParser.cpp @@ -174,8 +174,6 @@ bool ARM::getFPUFeatures(unsigned FPUKind, std::vector &Features) { // under FPURestriction::None, which is the only FPURestriction in // which they would be valid (since FPURestriction::SP doesn't // exist). - - {"+fpregs", "-fpregs", FPUVersion::VFPV2, FPURestriction::SP_D16}, {"+vfp2", "-vfp2", FPUVersion::VFPV2, FPURestriction::D16}, {"+vfp2sp", "-vfp2sp", FPUVersion::VFPV2, FPURestriction::SP_D16}, {"+vfp3", "-vfp3", FPUVersion::VFPV3, FPURestriction::None}, diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp index b5db172cc1a3a..104482de4ad70 100644 --- a/llvm/lib/Support/StringRef.cpp +++ b/llvm/lib/Support/StringRef.cpp @@ -588,13 +588,11 @@ bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { APFloat F(0.0); - auto ErrOrStatus = F.convertFromString(*this, APFloat::rmNearestTiesToEven); - if (!ErrOrStatus) { - assert(false && "Invalid floating point representation"); + auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven); + if (errorToBool(StatusOrErr.takeError())) return true; - } - APFloat::opStatus Status = *ErrOrStatus; + APFloat::opStatus Status = *StatusOrErr; if (Status != APFloat::opOK) { if (!AllowInexact || !(Status & APFloat::opInexact)) return true; diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index 618670553d091..773d6f05a9e5d 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -21,58 +21,59 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case UnknownArch: return "unknown"; case aarch64: return "aarch64"; - case aarch64_be: return "aarch64_be"; case aarch64_32: return "aarch64_32"; + case aarch64_be: return "aarch64_be"; + case amdgcn: return "amdgcn"; + case amdil64: return "amdil64"; + case amdil: return "amdil"; + case arc: return "arc"; case arm: return "arm"; case armeb: return "armeb"; - case arc: return "arc"; case avr: return "avr"; - case bpfel: return "bpfel"; case bpfeb: return "bpfeb"; + case bpfel: return "bpfel"; + case fpga_aoco: return "fpga_aoco"; + case fpga_aocr: return "fpga_aocr"; + case fpga_aocx: return "fpga_aocx"; case hexagon: return "hexagon"; - case mips: return "mips"; - case mipsel: return "mipsel"; + case hsail64: return "hsail64"; + case hsail: return "hsail"; + case kalimba: return "kalimba"; + case lanai: return "lanai"; + case le32: return "le32"; + case le64: return "le64"; case mips64: return "mips64"; case mips64el: return "mips64el"; + case mips: return "mips"; + case mipsel: return "mipsel"; case msp430: return "msp430"; + case nvptx64: return "nvptx64"; + case nvptx: return "nvptx"; case ppc64: return "powerpc64"; case ppc64le: return "powerpc64le"; case ppc: return "powerpc"; case r600: return "r600"; - case amdgcn: return "amdgcn"; + case renderscript32: return "renderscript32"; + case renderscript64: return "renderscript64"; case riscv32: return "riscv32"; case riscv64: return "riscv64"; + case shave: return "shave"; case 
sparc: return "sparc"; - case sparcv9: return "sparcv9"; case sparcel: return "sparcel"; + case sparcv9: return "sparcv9"; + case spir64: return "spir64"; + case spir: return "spir"; case systemz: return "s390x"; case tce: return "tce"; case tcele: return "tcele"; case thumb: return "thumb"; case thumbeb: return "thumbeb"; + case ve: return "ve"; + case wasm32: return "wasm32"; + case wasm64: return "wasm64"; case x86: return "i386"; case x86_64: return "x86_64"; case xcore: return "xcore"; - case nvptx: return "nvptx"; - case nvptx64: return "nvptx64"; - case le32: return "le32"; - case le64: return "le64"; - case amdil: return "amdil"; - case amdil64: return "amdil64"; - case hsail: return "hsail"; - case hsail64: return "hsail64"; - case spir: return "spir"; - case spir64: return "spir64"; - case kalimba: return "kalimba"; - case lanai: return "lanai"; - case shave: return "shave"; - case wasm32: return "wasm32"; - case wasm64: return "wasm64"; - case renderscript32: return "renderscript32"; - case renderscript64: return "renderscript64"; - case fpga_aoco: return "fpga_aoco"; - case fpga_aocr: return "fpga_aocr"; - case fpga_aocx: return "fpga_aocx"; } llvm_unreachable("Invalid ArchType!"); @@ -151,6 +152,8 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) { case fpga_aoco: case fpga_aocr: case fpga_aocx: return "fpga"; + + case ve: return "ve"; } } @@ -158,23 +161,23 @@ StringRef Triple::getVendorTypeName(VendorType Kind) { switch (Kind) { case UnknownVendor: return "unknown"; + case AMD: return "amd"; case Apple: return "apple"; - case PC: return "pc"; - case SCEI: return "scei"; case BGP: return "bgp"; case BGQ: return "bgq"; + case CSR: return "csr"; case Freescale: return "fsl"; case IBM: return "ibm"; case ImaginationTechnologies: return "img"; case Intel: return "intel"; + case Mesa: return "mesa"; case MipsTechnologies: return "mti"; - case NVIDIA: return "nvidia"; - case CSR: return "csr"; case Myriad: return "myriad"; - case AMD: return "amd"; - case Mesa: return "mesa"; - case SUSE: return "suse"; + case NVIDIA: return "nvidia"; case OpenEmbedded: return "oe"; + case PC: return "pc"; + case SCEI: return "scei"; + case SUSE: return "suse"; } llvm_unreachable("Invalid VendorType!"); @@ -184,41 +187,41 @@ StringRef Triple::getOSTypeName(OSType Kind) { switch (Kind) { case UnknownOS: return "unknown"; + case AIX: return "aix"; + case AMDHSA: return "amdhsa"; + case AMDPAL: return "amdpal"; case Ananas: return "ananas"; + case CNK: return "cnk"; + case CUDA: return "cuda"; case CloudABI: return "cloudabi"; + case Contiki: return "contiki"; case Darwin: return "darwin"; case DragonFly: return "dragonfly"; + case ELFIAMCU: return "elfiamcu"; + case Emscripten: return "emscripten"; case FreeBSD: return "freebsd"; case Fuchsia: return "fuchsia"; + case Haiku: return "haiku"; + case HermitCore: return "hermit"; + case Hurd: return "hurd"; case IOS: return "ios"; case KFreeBSD: return "kfreebsd"; case Linux: return "linux"; case Lv2: return "lv2"; case MacOSX: return "macosx"; - case NetBSD: return "netbsd"; - case OpenBSD: return "openbsd"; - case Solaris: return "solaris"; - case Win32: return "windows"; - case Haiku: return "haiku"; + case Mesa3D: return "mesa3d"; case Minix: return "minix"; - case RTEMS: return "rtems"; - case NaCl: return "nacl"; - case CNK: return "cnk"; - case AIX: return "aix"; - case CUDA: return "cuda"; case NVCL: return "nvcl"; - case AMDHSA: return "amdhsa"; + case NaCl: return "nacl"; + case NetBSD: return "netbsd"; + case OpenBSD: return "openbsd"; 
case PS4: return "ps4"; - case ELFIAMCU: return "elfiamcu"; + case RTEMS: return "rtems"; + case Solaris: return "solaris"; case TvOS: return "tvos"; - case WatchOS: return "watchos"; - case Mesa3D: return "mesa3d"; - case Contiki: return "contiki"; - case AMDPAL: return "amdpal"; - case HermitCore: return "hermit"; - case Hurd: return "hurd"; case WASI: return "wasi"; - case Emscripten: return "emscripten"; + case WatchOS: return "watchos"; + case Win32: return "windows"; } llvm_unreachable("Invalid OSType"); @@ -227,26 +230,26 @@ StringRef Triple::getOSTypeName(OSType Kind) { StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) { switch (Kind) { case UnknownEnvironment: return "unknown"; + case Android: return "android"; + case CODE16: return "code16"; + case CoreCLR: return "coreclr"; + case Cygnus: return "cygnus"; + case EABI: return "eabi"; + case EABIHF: return "eabihf"; case GNU: return "gnu"; - case GNUABIN32: return "gnuabin32"; case GNUABI64: return "gnuabi64"; - case GNUEABIHF: return "gnueabihf"; + case GNUABIN32: return "gnuabin32"; case GNUEABI: return "gnueabi"; + case GNUEABIHF: return "gnueabihf"; case GNUX32: return "gnux32"; - case CODE16: return "code16"; - case EABI: return "eabi"; - case EABIHF: return "eabihf"; - case Android: return "android"; + case Itanium: return "itanium"; + case MSVC: return "msvc"; + case MacABI: return "macabi"; case Musl: return "musl"; case MuslEABI: return "musleabi"; case MuslEABIHF: return "musleabihf"; - case MSVC: return "msvc"; - case Itanium: return "itanium"; - case Cygnus: return "cygnus"; - case CoreCLR: return "coreclr"; - case Simulator: return "simulator"; case SYCLDevice: return "sycldevice"; - case MacABI: return "macabi"; + case Simulator: return "simulator"; } llvm_unreachable("Invalid EnvironmentType!"); @@ -325,6 +328,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("fpga_aoco", fpga_aoco) .Case("fpga_aocr", fpga_aocr) .Case("fpga_aocx", fpga_aocx) + .Case("ve", ve) .Default(UnknownArch); } @@ -399,7 +403,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { // FIXME: Do we need to support these? .Cases("i786", "i886", "i986", Triple::x86) .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64) - .Cases("powerpc", "ppc", "ppc32", Triple::ppc) + .Cases("powerpc", "powerpcspe", "ppc", "ppc32", Triple::ppc) .Cases("powerpc64", "ppu", "ppc64", Triple::ppc64) .Cases("powerpc64le", "ppc64le", Triple::ppc64le) .Case("xscale", Triple::arm) @@ -448,14 +452,15 @@ static Triple::ArchType parseArch(StringRef ArchName) { .StartsWith("spir", Triple::spir) .StartsWith("kalimba", Triple::kalimba) .Case("lanai", Triple::lanai) - .Case("shave", Triple::shave) - .Case("wasm32", Triple::wasm32) - .Case("wasm64", Triple::wasm64) .Case("renderscript32", Triple::renderscript32) .Case("renderscript64", Triple::renderscript64) .Case("fpga_aoco", Triple::fpga_aoco) .Case("fpga_aocr", Triple::fpga_aocr) .Case("fpga_aocx", Triple::fpga_aocx) + .Case("shave", Triple::shave) + .Case("ve", Triple::ve) + .Case("wasm32", Triple::wasm32) + .Case("wasm64", Triple::wasm64) .Default(Triple::UnknownArch); // Some architectures require special parsing logic just to compute the @@ -588,6 +593,9 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { } } + if (SubArchName == "powerpcspe") + return Triple::PPCSubArch_spe; + StringRef ARMSubArch = ARM::getCanonicalArchName(SubArchName); // For now, this is the small part. Early return. 
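Assuming the powerpcspe changes above (the new parseArch .Cases entry plus the parseSubArch early check), the observable effect can be sketched as follows; the triple spelling is taken from the .Cases line, and PPCSubArch_spe is assumed to be declared in Triple.h by this patch:

    #include "llvm/ADT/Triple.h"
    #include <cassert>

    using namespace llvm;

    int main() {
      // "powerpcspe" now parses as 32-bit ppc carrying the SPE sub-arch tag.
      Triple T("powerpcspe-unknown-linux-gnu");
      assert(T.getArch() == Triple::ppc);
      assert(T.getSubArch() == Triple::PPCSubArch_spe);
      return 0;
    }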
@@ -662,10 +670,10 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { static StringRef getObjectFormatTypeName(Triple::ObjectFormatType Kind) { switch (Kind) { case Triple::UnknownObjectFormat: return ""; - case Triple::COFF: return "coff"; - case Triple::ELF: return "elf"; + case Triple::COFF: return "coff"; + case Triple::ELF: return "elf"; case Triple::MachO: return "macho"; - case Triple::Wasm: return "wasm"; + case Triple::Wasm: return "wasm"; case Triple::XCOFF: return "xcoff"; } llvm_unreachable("unknown object format type"); @@ -687,28 +695,31 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { return Triple::ELF; case Triple::aarch64_be: - case Triple::arc: case Triple::amdgcn: - case Triple::amdil: case Triple::amdil64: + case Triple::amdil: + case Triple::arc: case Triple::armeb: case Triple::avr: case Triple::bpfeb: case Triple::bpfel: + case Triple::fpga_aoco: + case Triple::fpga_aocr: + case Triple::fpga_aocx: case Triple::hexagon: - case Triple::lanai: - case Triple::hsail: case Triple::hsail64: + case Triple::hsail: case Triple::kalimba: + case Triple::lanai: case Triple::le32: case Triple::le64: - case Triple::mips: case Triple::mips64: case Triple::mips64el: + case Triple::mips: case Triple::mipsel: case Triple::msp430: - case Triple::nvptx: case Triple::nvptx64: + case Triple::nvptx: case Triple::ppc64le: case Triple::r600: case Triple::renderscript32: @@ -719,20 +730,18 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::sparc: case Triple::sparcel: case Triple::sparcv9: - case Triple::spir: case Triple::spir64: + case Triple::spir: case Triple::systemz: case Triple::tce: case Triple::tcele: case Triple::thumbeb: + case Triple::ve: case Triple::xcore: - case Triple::fpga_aoco: - case Triple::fpga_aocr: - case Triple::fpga_aocx: return Triple::ELF; - case Triple::ppc: case Triple::ppc64: + case Triple::ppc: if (T.isOSDarwin()) return Triple::MachO; else if (T.isOSAIX()) @@ -1263,58 +1272,59 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { return 16; case llvm::Triple::aarch64_32: + case llvm::Triple::amdil: case llvm::Triple::arc: case llvm::Triple::arm: case llvm::Triple::armeb: + case llvm::Triple::fpga_aoco: + case llvm::Triple::fpga_aocr: + case llvm::Triple::fpga_aocx: case llvm::Triple::hexagon: + case llvm::Triple::hsail: + case llvm::Triple::kalimba: + case llvm::Triple::lanai: case llvm::Triple::le32: case llvm::Triple::mips: case llvm::Triple::mipsel: case llvm::Triple::nvptx: case llvm::Triple::ppc: case llvm::Triple::r600: + case llvm::Triple::renderscript32: case llvm::Triple::riscv32: + case llvm::Triple::shave: case llvm::Triple::sparc: case llvm::Triple::sparcel: + case llvm::Triple::spir: case llvm::Triple::tce: case llvm::Triple::tcele: case llvm::Triple::thumb: case llvm::Triple::thumbeb: + case llvm::Triple::wasm32: case llvm::Triple::x86: case llvm::Triple::xcore: - case llvm::Triple::amdil: - case llvm::Triple::hsail: - case llvm::Triple::spir: - case llvm::Triple::kalimba: - case llvm::Triple::lanai: - case llvm::Triple::shave: - case llvm::Triple::wasm32: - case llvm::Triple::renderscript32: - case llvm::Triple::fpga_aoco: - case llvm::Triple::fpga_aocr: - case llvm::Triple::fpga_aocx: return 32; case llvm::Triple::aarch64: case llvm::Triple::aarch64_be: case llvm::Triple::amdgcn: - case llvm::Triple::bpfel: + case llvm::Triple::amdil64: case llvm::Triple::bpfeb: + case llvm::Triple::bpfel: + case llvm::Triple::hsail64: case llvm::Triple::le64: case 
llvm::Triple::mips64: case llvm::Triple::mips64el: case llvm::Triple::nvptx64: case llvm::Triple::ppc64: case llvm::Triple::ppc64le: + case llvm::Triple::renderscript64: case llvm::Triple::riscv64: case llvm::Triple::sparcv9: - case llvm::Triple::systemz: - case llvm::Triple::x86_64: - case llvm::Triple::amdil64: - case llvm::Triple::hsail64: case llvm::Triple::spir64: + case llvm::Triple::systemz: + case llvm::Triple::ve: case llvm::Triple::wasm64: - case llvm::Triple::renderscript64: + case llvm::Triple::x86_64: return 64; } llvm_unreachable("Invalid architecture value"); @@ -1338,63 +1348,64 @@ Triple Triple::get32BitArchVariant() const { case Triple::UnknownArch: case Triple::amdgcn: case Triple::avr: - case Triple::bpfel: case Triple::bpfeb: + case Triple::bpfel: case Triple::msp430: - case Triple::systemz: case Triple::ppc64le: + case Triple::systemz: + case Triple::ve: T.setArch(UnknownArch); break; case Triple::aarch64_32: case Triple::amdil: - case Triple::hsail: - case Triple::spir: case Triple::arc: case Triple::arm: case Triple::armeb: + case Triple::fpga_aoco: + case Triple::fpga_aocr: + case Triple::fpga_aocx: case Triple::hexagon: + case Triple::hsail: case Triple::kalimba: + case Triple::lanai: case Triple::le32: case Triple::mips: case Triple::mipsel: case Triple::nvptx: case Triple::ppc: case Triple::r600: + case Triple::renderscript32: case Triple::riscv32: + case Triple::shave: case Triple::sparc: case Triple::sparcel: + case Triple::spir: case Triple::tce: case Triple::tcele: case Triple::thumb: case Triple::thumbeb: + case Triple::wasm32: case Triple::x86: case Triple::xcore: - case Triple::lanai: - case Triple::shave: - case Triple::wasm32: - case Triple::renderscript32: - case Triple::fpga_aoco: - case Triple::fpga_aocr: - case Triple::fpga_aocx: // Already 32-bit. 
break; case Triple::aarch64: T.setArch(Triple::arm); break; case Triple::aarch64_be: T.setArch(Triple::armeb); break; + case Triple::amdil64: T.setArch(Triple::amdil); break; + case Triple::hsail64: T.setArch(Triple::hsail); break; case Triple::le64: T.setArch(Triple::le32); break; case Triple::mips64: T.setArch(Triple::mips); break; case Triple::mips64el: T.setArch(Triple::mipsel); break; case Triple::nvptx64: T.setArch(Triple::nvptx); break; case Triple::ppc64: T.setArch(Triple::ppc); break; - case Triple::sparcv9: T.setArch(Triple::sparc); break; + case Triple::renderscript64: T.setArch(Triple::renderscript32); break; case Triple::riscv64: T.setArch(Triple::riscv32); break; - case Triple::x86_64: T.setArch(Triple::x86); break; - case Triple::amdil64: T.setArch(Triple::amdil); break; - case Triple::hsail64: T.setArch(Triple::hsail); break; + case Triple::sparcv9: T.setArch(Triple::sparc); break; case Triple::spir64: T.setArch(Triple::spir); break; case Triple::wasm64: T.setArch(Triple::wasm32); break; - case Triple::renderscript64: T.setArch(Triple::renderscript32); break; + case Triple::x86_64: T.setArch(Triple::x86); break; } return T; } @@ -1405,63 +1416,64 @@ Triple Triple::get64BitArchVariant() const { case Triple::UnknownArch: case Triple::arc: case Triple::avr: + case Triple::fpga_aoco: + case Triple::fpga_aocr: + case Triple::fpga_aocx: case Triple::hexagon: case Triple::kalimba: case Triple::lanai: case Triple::msp430: case Triple::r600: + case Triple::shave: + case Triple::sparcel: case Triple::tce: case Triple::tcele: case Triple::xcore: - case Triple::sparcel: - case Triple::shave: - case Triple::fpga_aoco: - case Triple::fpga_aocr: - case Triple::fpga_aocx: T.setArch(UnknownArch); break; case Triple::aarch64: case Triple::aarch64_be: - case Triple::bpfel: - case Triple::bpfeb: - case Triple::le64: - case Triple::amdil64: case Triple::amdgcn: + case Triple::amdil64: + case Triple::bpfeb: + case Triple::bpfel: case Triple::hsail64: - case Triple::spir64: + case Triple::le64: case Triple::mips64: case Triple::mips64el: case Triple::nvptx64: case Triple::ppc64: case Triple::ppc64le: + case Triple::renderscript64: case Triple::riscv64: case Triple::sparcv9: + case Triple::spir64: case Triple::systemz: - case Triple::x86_64: + case Triple::ve: case Triple::wasm64: - case Triple::renderscript64: + case Triple::x86_64: // Already 64-bit. 
break; case Triple::aarch64_32: T.setArch(Triple::aarch64); break; + case Triple::amdil: T.setArch(Triple::amdil64); break; case Triple::arm: T.setArch(Triple::aarch64); break; case Triple::armeb: T.setArch(Triple::aarch64_be); break; + case Triple::hsail: T.setArch(Triple::hsail64); break; case Triple::le32: T.setArch(Triple::le64); break; case Triple::mips: T.setArch(Triple::mips64); break; case Triple::mipsel: T.setArch(Triple::mips64el); break; case Triple::nvptx: T.setArch(Triple::nvptx64); break; case Triple::ppc: T.setArch(Triple::ppc64); break; - case Triple::sparc: T.setArch(Triple::sparcv9); break; + case Triple::renderscript32: T.setArch(Triple::renderscript64); break; case Triple::riscv32: T.setArch(Triple::riscv64); break; - case Triple::x86: T.setArch(Triple::x86_64); break; - case Triple::amdil: T.setArch(Triple::amdil64); break; - case Triple::hsail: T.setArch(Triple::hsail64); break; + case Triple::sparc: T.setArch(Triple::sparcv9); break; case Triple::spir: T.setArch(Triple::spir64); break; case Triple::thumb: T.setArch(Triple::aarch64); break; case Triple::thumbeb: T.setArch(Triple::aarch64_be); break; case Triple::wasm32: T.setArch(Triple::wasm64); break; - case Triple::renderscript32: T.setArch(Triple::renderscript64); break; + case Triple::x86: T.setArch(Triple::x86_64); break; } return T; } @@ -1487,6 +1499,8 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::nvptx64: case Triple::nvptx: case Triple::r600: + case Triple::renderscript32: + case Triple::renderscript64: case Triple::riscv32: case Triple::riscv64: case Triple::shave: @@ -1497,8 +1511,7 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::x86: case Triple::x86_64: case Triple::xcore: - case Triple::renderscript32: - case Triple::renderscript64: + case Triple::ve: // ARM is intentionally unsupported here, changing the architecture would // drop any arch suffixes. 
@@ -1507,13 +1520,13 @@ Triple Triple::getBigEndianArchVariant() const { T.setArch(UnknownArch); break; - case Triple::tcele: T.setArch(Triple::tce); break; case Triple::aarch64: T.setArch(Triple::aarch64_be); break; case Triple::bpfel: T.setArch(Triple::bpfeb); break; case Triple::mips64el:T.setArch(Triple::mips64); break; case Triple::mipsel: T.setArch(Triple::mips); break; case Triple::ppc64le: T.setArch(Triple::ppc64); break; case Triple::sparcel: T.setArch(Triple::sparc); break; + case Triple::tcele: T.setArch(Triple::tce); break; default: llvm_unreachable("getBigEndianArchVariant: unknown triple."); } @@ -1539,13 +1552,13 @@ Triple Triple::getLittleEndianArchVariant() const { T.setArch(UnknownArch); break; - case Triple::tce: T.setArch(Triple::tcele); break; case Triple::aarch64_be: T.setArch(Triple::aarch64); break; case Triple::bpfeb: T.setArch(Triple::bpfel); break; case Triple::mips64: T.setArch(Triple::mips64el); break; case Triple::mips: T.setArch(Triple::mipsel); break; case Triple::ppc64: T.setArch(Triple::ppc64le); break; case Triple::sparc: T.setArch(Triple::sparcel); break; + case Triple::tce: T.setArch(Triple::tcele); break; default: llvm_unreachable("getLittleEndianArchVariant: unknown triple."); } @@ -1575,21 +1588,22 @@ bool Triple::isLittleEndian() const { case Triple::nvptx: case Triple::ppc64le: case Triple::r600: + case Triple::renderscript32: + case Triple::renderscript64: case Triple::riscv32: case Triple::riscv64: case Triple::shave: case Triple::sparcel: case Triple::spir64: case Triple::spir: + case Triple::tcele: case Triple::thumb: + case Triple::ve: case Triple::wasm32: case Triple::wasm64: case Triple::x86: case Triple::x86_64: case Triple::xcore: - case Triple::tcele: - case Triple::renderscript32: - case Triple::renderscript64: return true; default: return false; @@ -1644,10 +1658,10 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const { case llvm::Triple::Win32: // FIXME: this is invalid for WindowsCE return "cortex-a9"; - case llvm::Triple::MacOSX: case llvm::Triple::IOS: - case llvm::Triple::WatchOS: + case llvm::Triple::MacOSX: case llvm::Triple::TvOS: + case llvm::Triple::WatchOS: if (MArch == "v7k") return "cortex-a7"; break; @@ -1667,10 +1681,10 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const { switch (getOS()) { case llvm::Triple::NetBSD: switch (getEnvironment()) { - case llvm::Triple::GNUEABIHF: - case llvm::Triple::GNUEABI: - case llvm::Triple::EABIHF: case llvm::Triple::EABI: + case llvm::Triple::EABIHF: + case llvm::Triple::GNUEABI: + case llvm::Triple::GNUEABIHF: return "arm926ej-s"; default: return "strongarm"; diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 97162ae221871..3b8f8a19fe49c 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -349,38 +349,22 @@ bool AArch64ExpandPseudo::expandSetTagLoop( MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - Register SizeReg = MI.getOperand(0).getReg(); - Register AddressReg = MI.getOperand(1).getReg(); + Register SizeReg = MI.getOperand(2).getReg(); + Register AddressReg = MI.getOperand(3).getReg(); MachineFunction *MF = MBB.getParent(); bool ZeroData = MI.getOpcode() == AArch64::STZGloop; - const unsigned OpCode1 = - ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex; - const unsigned OpCode2 = + const unsigned OpCode = ZeroData ? 
                              AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex;
-  unsigned Size = MI.getOperand(2).getImm();
-  assert(Size > 0 && Size % 16 == 0);
-  if (Size % (16 * 2) != 0) {
-    BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg)
-        .addReg(AddressReg)
-        .addReg(AddressReg)
-        .addImm(1);
-    Size -= 16;
-  }
-  MachineBasicBlock::iterator I =
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg)
-          .addImm(Size);
-  expandMOVImm(MBB, I, 64);
-
   auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
   auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
 
   MF->insert(++MBB.getIterator(), LoopBB);
   MF->insert(++LoopBB->getIterator(), DoneBB);
 
-  BuildMI(LoopBB, DL, TII->get(OpCode2))
+  BuildMI(LoopBB, DL, TII->get(OpCode))
       .addDef(AddressReg)
       .addReg(AddressReg)
       .addReg(AddressReg)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 39d32863f15b2..c732106014e6c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -170,11 +170,6 @@ static cl::opt<bool>
                          cl::desc("reverse the CSR restore sequence"),
                          cl::init(false), cl::Hidden);
 
-static cl::opt<bool> StackTaggingMergeSetTag(
-    "stack-tagging-merge-settag",
-    cl::desc("merge settag instruction in function epilog"), cl::init(true),
-    cl::Hidden);
-
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 
 /// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -485,39 +480,6 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
   return true;
 }
 
-bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
-    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
-  if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
-    return false;
-
-  if (MBB.empty())
-    return true;
-
-  // Disable combined SP bump if the last instruction is an MTE tag store. It
-  // is almost always better to merge SP adjustment into those instructions.
-  MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
-  MachineBasicBlock::iterator Begin = MBB.begin();
-  while (LastI != Begin) {
-    --LastI;
-    if (LastI->isTransient())
-      continue;
-    if (!LastI->getFlag(MachineInstr::FrameDestroy))
-      break;
-  }
-  switch (LastI->getOpcode()) {
-  case AArch64::STGloop:
-  case AArch64::STZGloop:
-  case AArch64::STGOffset:
-  case AArch64::STZGOffset:
-  case AArch64::ST2GOffset:
-  case AArch64::STZ2GOffset:
-    return false;
-  default:
-    return true;
-  }
-  llvm_unreachable("unreachable");
-}
-
 // Given a load or a store instruction, generate an appropriate unwinding SEH
 // code on Windows.
 static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
@@ -1497,7 +1459,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // function.
   if (MF.hasEHFunclets())
     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
-  bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
+  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
   // Assume we can't combine the last pop with the sp restore.
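// Illustrative only (not emitted verbatim by this pass): when the bump can be
// combined, the epilogue restores callee-saves and frees the frame in a single
// post-indexed load, e.g.
//   ldp x19, x20, [sp], #48   // restore CSRs, then sp += 48 in one step
// instead of an ldp followed by a separate 'add sp, sp, #48'.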
  if (!CombineSPBump && PrologueSaveSize != 0) {
@@ -2675,399 +2637,9 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
         .addImm(0);
 }
 
-namespace {
-struct TagStoreInstr {
-  MachineInstr *MI;
-  int64_t Offset, Size;
-  explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
-      : MI(MI), Offset(Offset), Size(Size) {}
-};
-
-class TagStoreEdit {
-  MachineFunction *MF;
-  MachineBasicBlock *MBB;
-  MachineRegisterInfo *MRI;
-  // Tag store instructions that are being replaced.
-  SmallVector<TagStoreInstr, 8> TagStores;
-  // Combined memref arguments of the above instructions.
-  SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
-
-  // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
-  // FrameRegOffset + Size) with the address tag of SP.
-  Register FrameReg;
-  StackOffset FrameRegOffset;
-  int64_t Size;
-  // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
-  Optional<int64_t> FrameRegUpdate;
-  // MIFlags for any FrameReg updating instructions.
-  unsigned FrameRegUpdateFlags;
-
-  // Use zeroing instruction variants.
-  bool ZeroData;
-  DebugLoc DL;
-
-  void emitUnrolled(MachineBasicBlock::iterator InsertI);
-  void emitLoop(MachineBasicBlock::iterator InsertI);
-
-public:
-  TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
-      : MBB(MBB), ZeroData(ZeroData) {
-    MF = MBB->getParent();
-    MRI = &MF->getRegInfo();
-  }
-  // Add an instruction to be replaced. Instructions must be added in the
-  // ascending order of Offset, and have to be adjacent.
-  void addInstruction(TagStoreInstr I) {
-    assert((TagStores.empty() ||
-            TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
-           "Non-adjacent tag store instructions.");
-    TagStores.push_back(I);
-  }
-  void clear() { TagStores.clear(); }
-  // Emit equivalent code at the given location, and erase the current set of
-  // instructions. May skip if the replacement is not profitable. May invalidate
-  // the input iterator and replace it with a valid one.
-  void emitCode(MachineBasicBlock::iterator &InsertI,
-                const AArch64FrameLowering *TFI, bool IsLast);
-};
-
-void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
-  const AArch64InstrInfo *TII =
-      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
-
-  const int64_t kMinOffset = -256 * 16;
-  const int64_t kMaxOffset = 255 * 16;
-
-  Register BaseReg = FrameReg;
-  int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
-  if (BaseRegOffsetBytes < kMinOffset ||
-      BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
-    Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
-    emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
-                    {BaseRegOffsetBytes, MVT::i8}, TII);
-    BaseReg = ScratchReg;
-    BaseRegOffsetBytes = 0;
-  }
-
-  MachineInstr *LastI = nullptr;
-  while (Size) {
-    int64_t InstrSize = (Size > 16) ? 32 : 16;
-    unsigned Opcode =
-        InstrSize == 16
-            ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
-            : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
-    MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
-                          .addReg(AArch64::SP)
-                          .addReg(BaseReg)
-                          .addImm(BaseRegOffsetBytes / 16)
-                          .setMemRefs(CombinedMemRefs);
-    // A store to [BaseReg, #0] should go last for an opportunity to fold the
-    // final SP adjustment in the epilogue.
-    if (BaseRegOffsetBytes == 0)
-      LastI = I;
-    BaseRegOffsetBytes += InstrSize;
-    Size -= InstrSize;
-  }
-
-  if (LastI)
-    MBB->splice(InsertI, MBB, LastI);
-}
-
-void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
-  const AArch64InstrInfo *TII =
-      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
-
-  Register BaseReg = FrameRegUpdate
-                         ? FrameReg
-                         : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
-  Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
-
-  emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
-
-  int64_t LoopSize = Size;
-  // If the loop size is not a multiple of 32, split off one 16-byte store at
-  // the end to fold BaseReg update into.
-  if (FrameRegUpdate && *FrameRegUpdate)
-    LoopSize -= LoopSize % 32;
-  MachineInstr *LoopI =
-      BuildMI(*MBB, InsertI, DL,
-              TII->get(ZeroData ? AArch64::STZGloop : AArch64::STGloop))
-          .addDef(SizeReg)
-          .addDef(BaseReg)
-          .addImm(LoopSize)
-          .addReg(BaseReg)
-          .setMemRefs(CombinedMemRefs);
-  if (FrameRegUpdate)
-    LoopI->setFlags(FrameRegUpdateFlags);
-
-  int64_t ExtraBaseRegUpdate =
-      FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
-  if (LoopSize < Size) {
-    assert(FrameRegUpdate);
-    assert(Size - LoopSize == 16);
-    // Tag 16 more bytes at BaseReg and update BaseReg.
-    BuildMI(*MBB, InsertI, DL,
-            TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
-        .addDef(BaseReg)
-        .addReg(BaseReg)
-        .addReg(BaseReg)
-        .addImm(1 + ExtraBaseRegUpdate / 16)
-        .setMemRefs(CombinedMemRefs)
-        .setMIFlags(FrameRegUpdateFlags);
-  } else if (ExtraBaseRegUpdate) {
-    // Update BaseReg.
-    BuildMI(
-        *MBB, InsertI, DL,
-        TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
-        .addDef(BaseReg)
-        .addReg(BaseReg)
-        .addImm(std::abs(ExtraBaseRegUpdate))
-        .addImm(0)
-        .setMIFlags(FrameRegUpdateFlags);
-  }
-}
-
-// Check if *II is a register update that can be merged into STGloop that ends
-// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the
-// end of the loop.
-bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
-                       int64_t Size, int64_t *TotalOffset) {
-  MachineInstr &MI = *II;
-  if ((MI.getOpcode() == AArch64::ADDXri ||
-       MI.getOpcode() == AArch64::SUBXri) &&
-      MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
-    unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
-    int64_t Offset = MI.getOperand(2).getImm() << Shift;
-    if (MI.getOpcode() == AArch64::SUBXri)
-      Offset = -Offset;
-    int64_t AbsPostOffset = std::abs(Offset - Size);
-    const int64_t kMaxOffset =
-        0xFFF; // Max encoding for unshifted ADDXri / SUBXri
-    if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
-      *TotalOffset = Offset;
-      return true;
-    }
-  }
-  return false;
-}
-
-void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
-                  SmallVectorImpl<MachineMemOperand *> &MemRefs) {
-  MemRefs.clear();
-  for (auto &TS : TSE) {
-    MachineInstr *MI = TS.MI;
-    // An instruction without memory operands may access anything. Be
-    // conservative and return an empty list.
- if (MI->memoperands_empty()) { - MemRefs.clear(); - return; - } - MemRefs.append(MI->memoperands_begin(), MI->memoperands_end()); - } -} - -void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast) { - if (TagStores.empty()) - return; - TagStoreInstr &FirstTagStore = TagStores[0]; - TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1]; - Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size; - DL = TagStores[0].MI->getDebugLoc(); - - unsigned Reg; - FrameRegOffset = TFI->resolveFrameOffsetReference( - *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg, - /*PreferFP=*/false, /*ForSimm=*/true); - FrameReg = Reg; - FrameRegUpdate = None; - - mergeMemRefs(TagStores, CombinedMemRefs); - - LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n"; - for (const auto &Instr - : TagStores) { dbgs() << " " << *Instr.MI; }); - - // Size threshold where a loop becomes shorter than a linear sequence of - // tagging instructions. - const int kSetTagLoopThreshold = 176; - if (Size < kSetTagLoopThreshold) { - if (TagStores.size() < 2) - return; - emitUnrolled(InsertI); - } else { - MachineInstr *UpdateInstr = nullptr; - int64_t TotalOffset; - if (IsLast) { - // See if we can merge base register update into the STGloop. - // This is done in AArch64LoadStoreOptimizer for "normal" stores, - // but STGloop is way too unusual for that, and also it only - // realistically happens in function epilogue. Also, STGloop is expanded - // before that pass. - if (InsertI != MBB->end() && - canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size, - &TotalOffset)) { - UpdateInstr = &*InsertI++; - LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n " - << *UpdateInstr); - } - } - - if (!UpdateInstr && TagStores.size() < 2) - return; - - if (UpdateInstr) { - FrameRegUpdate = TotalOffset; - FrameRegUpdateFlags = UpdateInstr->getFlags(); - } - emitLoop(InsertI); - if (UpdateInstr) - UpdateInstr->eraseFromParent(); - } - - for (auto &TS : TagStores) - TS.MI->eraseFromParent(); -} - -bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset, - int64_t &Size, bool &ZeroData) { - MachineFunction &MF = *MI.getParent()->getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - - unsigned Opcode = MI.getOpcode(); - ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset || - Opcode == AArch64::STZ2GOffset); - - if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) { - if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead()) - return false; - if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI()) - return false; - Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex()); - Size = MI.getOperand(2).getImm(); - return true; - } - - if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset) - Size = 16; - else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset) - Size = 32; - else - return false; - - if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI()) - return false; - - Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) + - 16 * MI.getOperand(2).getImm(); - return true; -} - -// Detect a run of memory tagging instructions for adjacent stack frame slots, -// and replace them with a shorter instruction sequence: -// * replace STG + STG with ST2G -// * replace STGloop + STGloop with STGloop -// This code needs to run when stack slot offsets are already known, but before -// FrameIndex operands in STG 
instructions are eliminated.
-MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
-                                                const AArch64FrameLowering *TFI,
-                                                RegScavenger *RS) {
-  bool FirstZeroData;
-  int64_t Size, Offset;
-  MachineInstr &MI = *II;
-  MachineBasicBlock *MBB = MI.getParent();
-  MachineBasicBlock::iterator NextI = ++II;
-  if (&MI == &MBB->instr_back())
-    return II;
-  if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
-    return II;
-
-  SmallVector<TagStoreInstr, 8> Instrs;
-  Instrs.emplace_back(&MI, Offset, Size);
-
-  constexpr int kScanLimit = 10;
-  int Count = 0;
-  for (MachineBasicBlock::iterator E = MBB->end();
-       NextI != E && Count < kScanLimit; ++NextI) {
-    MachineInstr &MI = *NextI;
-    bool ZeroData;
-    int64_t Size, Offset;
-    // Collect instructions that update memory tags with a FrameIndex operand
-    // and (when applicable) constant size, and whose output registers are dead
-    // (the latter is almost always the case in practice). Since these
-    // instructions effectively have no inputs or outputs, we are free to skip
-    // any non-aliasing instructions in between without tracking used registers.
-    if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
-      if (ZeroData != FirstZeroData)
-        break;
-      Instrs.emplace_back(&MI, Offset, Size);
-      continue;
-    }
-
-    // Only count non-transient, non-tagging instructions toward the scan
-    // limit.
-    if (!MI.isTransient())
-      ++Count;
-
-    // Just in case, stop before the epilogue code starts.
-    if (MI.getFlag(MachineInstr::FrameSetup) ||
-        MI.getFlag(MachineInstr::FrameDestroy))
-      break;
-
-    // Reject anything that may alias the collected instructions.
-    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
-      break;
-  }
-
-  // New code will be inserted after the last tagging instruction we've found.
-  MachineBasicBlock::iterator InsertI = Instrs.back().MI;
-  InsertI++;
-
-  llvm::stable_sort(Instrs,
-                    [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
-                      return Left.Offset < Right.Offset;
-                    });
-
-  // Make sure that we don't have any overlapping stores.
-  int64_t CurOffset = Instrs[0].Offset;
-  for (auto &Instr : Instrs) {
-    if (CurOffset > Instr.Offset)
-      return NextI;
-    CurOffset = Instr.Offset + Instr.Size;
-  }
-
-  // Find contiguous runs of tagged memory and emit shorter instruction
-  // sequences for them when possible.
-  TagStoreEdit TSE(MBB, FirstZeroData);
-  Optional<int64_t> EndOffset;
-  for (auto &Instr : Instrs) {
-    if (EndOffset && *EndOffset != Instr.Offset) {
-      // Found a gap.
-      TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
-      TSE.clear();
-    }
-
-    TSE.addInstruction(Instr);
-    EndOffset = Instr.Offset + Instr.Size;
-  }
-
-  TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
-
-  return InsertI;
-}
-} // namespace
-
-void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
-    MachineFunction &MF, RegScavenger *RS = nullptr) const {
-  if (StackTaggingMergeSetTag)
-    for (auto &BB : MF)
-      for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
-        II = tryMergeAdjacentSTG(II, this, RS);
-}
-
-/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
-/// before the update. This is easily retrieved as it is exactly the offset
-/// that is set in processFunctionBeforeFrameFinalized.
+/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before
+/// the update. This is easily retrieved as it is exactly the offset that is set
+/// in processFunctionBeforeFrameFinalized.
int AArch64FrameLowering::getFrameIndexReferencePreferSP( const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 57a7924fb8f8f..b5719feb6b154 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -77,10 +77,6 @@ class AArch64FrameLowering : public TargetFrameLowering { void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; - void - processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, - RegScavenger *RS) const override; - unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; @@ -111,8 +107,6 @@ class AArch64FrameLowering : public TargetFrameLowering { int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, int &MinCSFrameIndex, int &MaxCSFrameIndex) const; - bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, - unsigned StackBumpBytes) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 0ed2a678c4f01..54f3f7c101324 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3458,8 +3458,6 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, case AArch64::ST1Fourv1d: case AArch64::IRG: case AArch64::IRGstack: - case AArch64::STGloop: - case AArch64::STZGloop: return AArch64FrameOffsetCannotUpdate; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 04a23f31ffd60..f4d340c9f06a0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1514,17 +1514,17 @@ def TAGPstack // register / expression for the tagged base pointer of the current function. def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>; -// Large STG to be expanded into a loop. $sz is the size, $Rn is start address. -// $Rn_wback is one past the end of the range. $Rm is the loop counter. +// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address. +// $Rn_wback is one past the end of the range. 
let isCodeGenOnly=1, mayStore=1 in { def STGloop - : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, Sched<[WriteAdr, WriteST]>; def STZGloop - : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, Sched<[WriteAdr, WriteST]>; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 4a3778a2fd072..14f839cd4f812 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -390,10 +390,6 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, if (isFrameOffsetLegal(MI, AArch64::SP, Offset)) return false; - // If even offset 0 is illegal, we don't want a virtual base register. - if (!isFrameOffsetLegal(MI, AArch64::SP, 0)) - return false; - // The offset likely isn't legal; we want to allocate a virtual base register. return true; } @@ -449,17 +445,6 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, (void)Done; } -static Register getScratchRegisterForInstruction(MachineInstr &MI) { - // ST*Gloop can only have #fi in op3, and they have a constraint that - // op1==op3. Since op1 is early-clobber, it may (and also must) be used as the - // scratch register. - if (MI.getOpcode() == AArch64::STGloop || MI.getOpcode() == AArch64::STZGloop) - return MI.getOperand(1).getReg(); - else - return MI.getMF()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); -} - void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -516,7 +501,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // in a scratch register. Offset = TFI->resolveFrameIndexReference( MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); - Register ScratchReg = getScratchRegisterForInstruction(MI); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg) @@ -545,7 +531,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. 
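// Illustrative expansion (register names are placeholders, not chosen here):
//   mov  x9, #LargeOffset      // materialize the out-of-range offset
//   add  x9, fp, x9            // ScratchReg = FrameReg + Offset
// after which the frame-index operand below is rewritten to use x9 directly.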
-  Register ScratchReg = getScratchRegisterForInstruction(MI);
+  Register ScratchReg =
+      MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
   emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
   MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
 }
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index e050a0028eca6..ba61ed726e840 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -125,13 +125,19 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(
     return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand,
                               ZeroData);
 
-  const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other};
-
-  if (Addr.getOpcode() == ISD::FrameIndex) {
-    int FI = cast<FrameIndexSDNode>(Addr)->getIndex();
-    Addr = DAG.getTargetFrameIndex(FI, MVT::i64);
+  if (ObjSize % 32 != 0) {
+    SDNode *St1 = DAG.getMachineNode(
+        ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl,
+        {MVT::i64, MVT::Other},
+        {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain});
+    DAG.setNodeMemRefs(cast<MachineSDNode>(St1), {BaseMemOperand});
+    ObjSize -= 16;
+    Addr = SDValue(St1, 0);
+    Chain = SDValue(St1, 1);
   }
-  SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain};
+
+  const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other};
+  SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain};
 
   SDNode *St = DAG.getMachineNode(
       ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops);
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index bd48e5d846af3..70c9db13f139d 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1223,7 +1223,7 @@ class AArch64Operand : public MCParsedAsmOperand {
       APFloat RealVal(APFloat::IEEEdouble());
       auto StatusOrErr =
           RealVal.convertFromString(Desc->Repr, APFloat::rmTowardZero);
-      if (!StatusOrErr || *StatusOrErr != APFloat::opOK)
+      if (errorToBool(StatusOrErr.takeError()) || *StatusOrErr != APFloat::opOK)
         llvm_unreachable("FP immediate is not exact");
 
       if (getFPImm().bitwiseIsEqual(RealVal))
@@ -2580,7 +2580,7 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
     APFloat RealVal(APFloat::IEEEdouble());
     auto StatusOrErr =
         RealVal.convertFromString(Tok.getString(), APFloat::rmTowardZero);
-    if (!StatusOrErr) {
+    if (errorToBool(StatusOrErr.takeError())) {
       TokError("invalid floating point representation");
       return MatchOperand_ParseFail;
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index d7c211f1ed930..f9983693a99ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -30,6 +30,10 @@ def gi_vop3mods :
     GIComplexOperandMatcher<s32, "selectVOP3Mods">,
     GIComplexPatternEquiv<VOP3Mods>;
 
+def gi_vop3mods_nnan :
+    GIComplexOperandMatcher<s32, "selectVOP3Mods_nnan">,
+    GIComplexPatternEquiv<VOP3Mods_nnan>;
+
 def gi_vop3omods :
     GIComplexOperandMatcher<s32, "selectVOP3OMods">,
     GIComplexPatternEquiv<VOP3OMods>;
 
@@ -204,3 +208,12 @@ foreach Ty = [i64, p0, p1, p4] in {
 
 def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">,
     GISDNodeXFormEquiv<as_i32timm>;
+
+def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">,
+    GISDNodeXFormEquiv<NegateImm>;
+
+def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">,
+    GISDNodeXFormEquiv<bitcast_fpimm_to_i32>;
+
+def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">,
+    GISDNodeXFormEquiv<IMMPopCount>;
diff --git
a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a41c8f1a6a3f3..132c51c9e08f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1684,6 +1684,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_SEXT: case TargetOpcode::G_ZEXT: case TargetOpcode::G_ANYEXT: + if (selectImpl(I, *CoverageInfo)) + return true; return selectG_SZA_EXT(I); case TargetOpcode::G_BRCOND: return selectG_BRCOND(I); @@ -1770,6 +1772,20 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) + return None; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const { // FIXME: Handle clamp and op_sel @@ -2097,6 +2113,29 @@ void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, MIB.addImm(CstVal.getValue()); } +void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); +} + +void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const { + const MachineOperand &Op = MI.getOperand(1); + if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) + MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); + else { + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + MIB.addImm(Op.getCImm()->getSExtValue()); + } +} + +void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); +} + bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 45782ab3185ef..0799ace086756 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -132,6 +132,8 @@ class AMDGPUInstructionSelector : public InstructionSelector { selectVOP3OMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3Mods_nnan(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3OpSelMods0(MachineOperand &Root) const; @@ -169,6 +171,15 @@ class AMDGPUInstructionSelector : public InstructionSelector { void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI) const; + void renderNegateImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const; + + void renderBitcastImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const; + + void renderPopcntImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const; + 
   bool isInlineImmediate16(int64_t Imm) const;
   bool isInlineImmediate32(int64_t Imm) const;
   bool isInlineImmediate64(int64_t Imm) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 89e52d63af2d2..7e71dbdd12408 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -336,15 +336,10 @@ class Aligned<int Bytes> {
   int MinAlignment = Bytes;
 }
 
-class LoadFrag<SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>;
-
-class StoreFrag<SDPatternOperator op> : PatFrag <
-  (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
->;
-
 class StoreHi16<SDPatternOperator op> : PatFrag <
-  (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)
->;
+  (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)> {
+  let IsStore = 1;
+}
 
 def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>;
 def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>;
@@ -366,48 +361,6 @@ def StoreAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>;
 
-class GlobalLoadAddress : CodePatPred<[{
-  auto AS = cast<MemSDNode>(N)->getAddressSpace();
-  return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS;
-}]>;
-
-class FlatLoadAddress : CodePatPred<[{
-  const auto AS = cast<MemSDNode>(N)->getAddressSpace();
-  return AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::GLOBAL_ADDRESS ||
-         AS == AMDGPUAS::CONSTANT_ADDRESS;
-}]>;
-
-class GlobalAddress : CodePatPred<[{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
-}]>;
-
-class PrivateAddress : CodePatPred<[{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
-}]>;
-
-class LocalAddress : CodePatPred<[{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
-
-class RegionAddress : CodePatPred<[{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
-}]>;
-
-class FlatStoreAddress : CodePatPred<[{
-  const auto AS = cast<MemSDNode>(N)->getAddressSpace();
-  return AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::GLOBAL_ADDRESS;
-}]>;
-
-// TODO: Remove these when stores to new PatFrag format.
-class PrivateStore<SDPatternOperator op> : StoreFrag<op>, PrivateAddress;
-class LocalStore<SDPatternOperator op> : StoreFrag<op>, LocalAddress;
-class RegionStore<SDPatternOperator op> : StoreFrag<op>, RegionAddress;
-class GlobalStore<SDPatternOperator op> : StoreFrag<op>, GlobalAddress;
-class FlatStore<SDPatternOperator op> : StoreFrag<op>, FlatStoreAddress;
-
-
 foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
 let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
 
@@ -485,6 +438,10 @@ def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr),
   let MemoryVT = i16;
 }
 
+def store_hi16_#as : StoreHi16 <store>;
+def truncstorei8_hi16_#as : StoreHi16<truncstorei8>;
+def truncstorei16_hi16_#as : StoreHi16<truncstorei16>;
+
 defm atomic_store_#as : binary_atomic_op<atomic_store>;
 
 } // End let AddressSpaces = ...
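For orientation, a minimal sketch (hypothetical fragment name, not part of the patch) of how a store fragment now gets its address-space restriction from the AddressSpaces field rather than from a separate *Address predicate class:

// Hypothetical example: an i8 truncating store restricted to the local (LDS)
// address space under the new PatFrag scheme.
let AddressSpaces = StoreAddress_local.AddrSpaces in {
def truncstorei8_example_local : PatFrag <
  (ops node:$val, node:$ptr), (truncstorei8 node:$val, node:$ptr)> {
  let IsStore = 1;
  let MemoryVT = i8;
}
}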
@@ -521,18 +478,6 @@ defm atomic_load_fadd : ret_noret_binary_atomic_op; defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op; -def store_hi16_private : StoreHi16 , PrivateAddress; -def truncstorei8_hi16_private : StoreHi16, PrivateAddress; - -def store_atomic_global : GlobalStore; -def truncstorei8_hi16_global : StoreHi16 , GlobalAddress; -def truncstorei16_hi16_global : StoreHi16 , GlobalAddress; - -def store_local_hi16 : StoreHi16 , LocalAddress; -def truncstorei8_local_hi16 : StoreHi16, LocalAddress; -def atomic_store_local : LocalStore ; - - def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { let IsLoad = 1; let IsNonExtLoad = 1; @@ -557,30 +502,6 @@ def store_align16_local: PatFrag<(ops node:$val, node:$ptr), let IsTruncStore = 0; } - -def atomic_store_flat : FlatStore ; -def truncstorei8_hi16_flat : StoreHi16, FlatStoreAddress; -def truncstorei16_hi16_flat : StoreHi16, FlatStoreAddress; - - -class local_binary_atomic_op : - PatFrag<(ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; - -class region_binary_atomic_op : - PatFrag<(ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; -}]>; - - -def mskor_global : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -}]>; - let AddressSpaces = StoreAddress_local.AddrSpaces in { defm atomic_cmp_swap_local : ternary_atomic_op; defm atomic_cmp_swap_local_m0 : ternary_atomic_op; @@ -591,17 +512,6 @@ defm atomic_cmp_swap_region : ternary_atomic_op; defm atomic_cmp_swap_region_m0 : ternary_atomic_op; } -// Legacy. -def atomic_cmp_swap_global_noret : PatFrag< - (ops node:$ptr, node:$cmp, node:$value), - (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; - -def atomic_cmp_swap_global_ret : PatFrag< - (ops node:$ptr, node:$cmp, node:$value), - (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; - //===----------------------------------------------------------------------===// // Misc Pattern Fragments //===----------------------------------------------------------------------===// @@ -827,30 +737,6 @@ class ROTRPattern : AMDGPUPat < (BIT_ALIGN $src0, $src0, $src1) >; -multiclass IntMed3Pat { - - // This matches 16 permutations of - // min(max(a, b), max(min(a, b), c)) - def : AMDGPUPat < - (min (max_oneuse vt:$src0, vt:$src1), - (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)), - (med3Inst vt:$src0, vt:$src1, vt:$src2) ->; - - // This matches 16 permutations of - // max(min(x, y), min(max(x, y), z)) - def : AMDGPUPat < - (max (min_oneuse vt:$src0, vt:$src1), - (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), - (med3Inst $src0, $src1, $src2) ->; -} - // Special conversion patterns def cvt_rpi_i32_f32 : PatFrag < diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 326df6bc8fb2b..d5834826fcd8b 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -2363,7 +2363,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { APFloat RealVal(APFloat::IEEEdouble()); auto roundMode = 
APFloat::rmNearestTiesToEven; - if (!RealVal.convertFromString(Num, roundMode)) { + if (errorToBool(RealVal.convertFromString(Num, roundMode).takeError())) { return MatchOperand_ParseFail; } if (Negate) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 1b12550aed88f..691aff4ecbb8a 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1621,8 +1621,8 @@ multiclass MUBUFStore_Atomic_Pattern ; } let SubtargetPredicate = isGFX6GFX7 in { -defm : MUBUFStore_Atomic_Pattern ; -defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; } // End Predicates = isGFX6GFX7 diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index f008b800bd327..f4e50e3a15e9a 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -619,7 +619,7 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; def : GCNPat < (int_amdgcn_ds_swizzle i32:$src, timm:$offset16), - (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) + (DS_SWIZZLE_B32 VGPR_32:$src, (as_i16imm $offset16), (i1 0)) >; class DSReadPat : GCNPat < @@ -733,8 +733,8 @@ defm : DSAtomicWritePat_mc ; defm : DSAtomicWritePat_mc ; let OtherPredicates = [D16PreservesUnusedBits] in { -def : DSWritePat ; -def : DSWritePat ; +def : DSWritePat ; +def : DSWritePat ; } diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index e106af42deddb..2057cac346d45 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -896,8 +896,8 @@ def : FlatSignedLoadPat_D16 ; def : FlatSignedLoadPat_D16 ; } -def : FlatStoreSignedAtomicPat ; -def : FlatStoreSignedAtomicPat ; +def : FlatStoreSignedAtomicPat ; +def : FlatStoreSignedAtomicPat ; def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td index f40eece859ee7..cbdf0de44f873 100644 --- a/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -295,9 +295,23 @@ class VTX_READ pattern> let VTXInst = 1; } -// FIXME: Deprecated. -class LocalLoad : LoadFrag , LocalAddress; +// Legacy. 
+def atomic_cmp_swap_global_noret : PatFrag< + (ops node:$ptr, node:$cmp, node:$value), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), + [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + +def atomic_cmp_swap_global_ret : PatFrag< + (ops node:$ptr, node:$cmp, node:$value), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), + [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; + +def mskor_global : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +}]>; +// FIXME: These are deprecated class AZExtLoadBase : PatFrag<(ops node:$ptr), (ld_node node:$ptr), [{ LoadSDNode *L = cast(N); @@ -319,9 +333,10 @@ def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; }]>; -// FIXME: These are deprecated -def az_extloadi8_local : LocalLoad ; -def az_extloadi16_local : LocalLoad ; +let AddressSpaces = LoadAddress_local.AddrSpaces in { +def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr)>; +def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr)>; +} class LoadParamFrag : PatFrag < (ops node:$ptr), (load_type node:$ptr), diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 609a345ea18ca..1518beafc7aba 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -579,46 +579,37 @@ def si_setcc_uniform : PatFrag < // SDNodes PatFrags for d16 loads //===----------------------------------------------------------------------===// -class LoadD16Frag : PatFrag<(ops node:$ptr, node:$tied_in), (op node:$ptr, node:$tied_in)>; -class LocalLoadD16 : LoadD16Frag , LocalAddress; -class GlobalLoadD16 : LoadD16Frag , GlobalLoadAddress; -class PrivateLoadD16 : LoadD16Frag , PrivateAddress; -class FlatLoadD16 : LoadD16Frag , FlatLoadAddress; - -def load_d16_hi_local : LocalLoadD16 ; -def az_extloadi8_d16_hi_local : LocalLoadD16 ; -def sextloadi8_d16_hi_local : LocalLoadD16 ; - -def load_d16_hi_global : GlobalLoadD16 ; -def az_extloadi8_d16_hi_global : GlobalLoadD16 ; -def sextloadi8_d16_hi_global : GlobalLoadD16 ; - -def load_d16_hi_private : PrivateLoadD16 ; -def az_extloadi8_d16_hi_private : PrivateLoadD16 ; -def sextloadi8_d16_hi_private : PrivateLoadD16 ; +class LoadD16Frag : PatFrag< + (ops node:$ptr, node:$tied_in), + (op node:$ptr, node:$tied_in)> { + let IsLoad = 1; +} -def load_d16_hi_flat : FlatLoadD16 ; -def az_extloadi8_d16_hi_flat : FlatLoadD16 ; -def sextloadi8_d16_hi_flat : FlatLoadD16 ; +foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { +let AddressSpaces = !cast("LoadAddress_"#as).AddrSpaces in { +def load_d16_hi_#as : LoadD16Frag ; -def load_d16_lo_local : LocalLoadD16 ; -def az_extloadi8_d16_lo_local : LocalLoadD16 ; -def sextloadi8_d16_lo_local : LocalLoadD16 ; +def az_extloadi8_d16_hi_#as : LoadD16Frag { + let MemoryVT = i8; +} -def load_d16_lo_global : GlobalLoadD16 ; -def az_extloadi8_d16_lo_global : GlobalLoadD16 ; -def sextloadi8_d16_lo_global : GlobalLoadD16 ; +def sextloadi8_d16_hi_#as : LoadD16Frag { + let MemoryVT = i8; +} -def load_d16_lo_private : PrivateLoadD16 ; -def az_extloadi8_d16_lo_private : PrivateLoadD16 ; -def sextloadi8_d16_lo_private : PrivateLoadD16 ; +def load_d16_lo_#as : LoadD16Frag ; -def load_d16_lo_flat : FlatLoadD16 ; -def az_extloadi8_d16_lo_flat : 
FlatLoadD16 ; -def sextloadi8_d16_lo_flat : FlatLoadD16 ; +def az_extloadi8_d16_lo_#as : LoadD16Frag { + let MemoryVT = i8; +} +def sextloadi8_d16_lo_#as : LoadD16Frag { + let MemoryVT = i8; +} +} // End let AddressSpaces = ... +} // End foreach AddrSpace def lshr_rev : PatFrag < (ops node:$src1, node:$src0), diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index dcc139a9fe943..d84720f820ee3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1930,9 +1930,22 @@ def : GCNPat < // TODO: Also do for 64-bit. def : GCNPat< (add i32:$src0, (i32 NegSubInlineConst32:$src1)), - (S_SUB_I32 $src0, NegSubInlineConst32:$src1) + (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1) >; +def : GCNPat< + (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { + let SubtargetPredicate = HasAddNoCarryInsts; +} + +def : GCNPat< + (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (V_SUB_I32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { + let SubtargetPredicate = NotHasAddNoCarryInsts; +} + + // Avoid pointlessly materializing a constant in VGPR. // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. @@ -1959,6 +1972,29 @@ defm : BFMPatterns ; defm : BFEPattern ; defm : SHA256MaPattern ; +multiclass IntMed3Pat { + + // This matches 16 permutations of + // min(max(a, b), max(min(a, b), c)) + def : AMDGPUPat < + (min (max_oneuse i32:$src0, i32:$src1), + (max_oneuse (min_oneuse i32:$src0, i32:$src1), i32:$src2)), + (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) +>; + + // This matches 16 permutations of + // max(min(x, y), min(max(x, y), z)) + def : AMDGPUPat < + (max (min_oneuse i32:$src0, i32:$src1), + (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)), + (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) +>; +} + defm : IntMed3Pat; defm : IntMed3Pat; @@ -1989,22 +2025,21 @@ multiclass Int16Med3Pat { + SDPatternOperator min_oneuse> { // This matches 16 permutations of // max(min(x, y), min(max(x, y), z)) def : GCNPat < - (max (min_oneuse vt:$src0, vt:$src1), - (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), - (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) + (max (min_oneuse i16:$src0, i16:$src1), + (min_oneuse (max_oneuse i16:$src0, i16:$src1), i16:$src2)), + (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) >; // This matches 16 permutations of // min(max(a, b), max(min(a, b), c)) def : GCNPat < - (min (max_oneuse vt:$src0, vt:$src1), - (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)), - (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) + (min (max_oneuse i16:$src0, i16:$src1), + (max_oneuse (min_oneuse i16:$src0, i16:$src1), i16:$src2)), + (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) >; } diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 064b26665542f..aaadc3dbc7215 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -729,7 +729,7 @@ multiclass Arithmetic_i16_0Hi_Pats { def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - (inst $src0, $src1) + (inst VSrc_b16:$src0, VSrc_b16:$src1) >; def : GCNPat< @@ -771,7 +771,7 @@ let Predicates = [Has16BitInsts] in { // 
TODO: Also do for 64-bit. def : GCNPat< (add i16:$src0, (i16 NegSubInlineConst16:$src1)), - (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) + (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineConst16:$src1) >; @@ -779,7 +779,7 @@ let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { def : GCNPat< (i32 (zext (add i16:$src0, (i16 NegSubInlineConst16:$src1)))), - (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) + (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineConst16:$src1) >; defm : Arithmetic_i16_0Hi_Pats; diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 31a98d86a54d2..6c45eecf0c23d 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -19,6 +19,22 @@ /// which determines whether we can generated the tail-predicated low-overhead /// loop form. /// +/// Assumptions and Dependencies: +/// Low-overhead loops are constructed and executed using a setup instruction: +/// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. +/// WLS(TP) and LE(TP) are branching instructions with a (large) limited range +/// but fixed polarity: WLS can only branch forwards and LE can only branch +/// backwards. These restrictions mean that this pass is dependent upon block +/// layout and block sizes, which is why it's the last pass to run. The same is +/// true for ConstantIslands, but this pass does not increase the size of the +/// basic blocks, nor does it change the CFG. Instructions are mainly removed +/// during the transform and pseudo instructions are replaced by real ones. In +/// some cases, when we have to revert to a 'normal' loop, we have to introduce +/// multiple instructions for a single pseudo (see RevertWhile and +/// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd +/// are defined to be as large as this maximum sequence of replacement +/// instructions. +/// //===----------------------------------------------------------------------===// #include "ARM.h" diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 41ad8b0c04de4..7ff05034c1f25 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1235,6 +1235,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned Cost = 0; for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + // Don't unroll vectorised loop. MVE does not benefit from it as much as + // scalar code. + if (I.getType()->isVectorTy()) + return; + if (isa(I) || isa(I)) { ImmutableCallSite CS(&I); if (const Function *F = CS.getCalledFunction()) { @@ -1243,10 +1248,6 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, } return; } - // Don't unroll vectorised loop. MVE does not benefit from it as much as - // scalar code. 
-      if (I.getType()->isVectorTy())
-        return;
 
       SmallVector<const Value *, 4> Operands(I.value_op_begin(),
                                              I.value_op_end());
diff --git a/llvm/lib/Target/LLVMBuild.txt b/llvm/lib/Target/LLVMBuild.txt
index d6a95a3c67133..7403f7713a9f6 100644
--- a/llvm/lib/Target/LLVMBuild.txt
+++ b/llvm/lib/Target/LLVMBuild.txt
@@ -36,6 +36,7 @@ subdirectories =
  WebAssembly
  X86
  XCore
+ VE
 
; This is a special group whose required libraries are extended (by llvm-build)
; with the best execution engine (the native JIT, if available, or the
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 1b67e1e55bf78..74192cb20cd05 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -897,6 +897,8 @@ bool PPCMIPeephole::simplifyCode(void) {
         bool Is64Bit = (MI.getOpcode() == PPC::RLWINM8 ||
                         MI.getOpcode() == PPC::RLWINM8_rec);
 
+        Simplified = true;
+
         LLVM_DEBUG(dbgs() << "Replace Instr: ");
         LLVM_DEBUG(MI.dump());
 
@@ -913,9 +915,14 @@ bool PPCMIPeephole::simplifyCode(void) {
             MI.RemoveOperand(3);
             MI.getOperand(2).setImm(0);
             MI.setDesc(TII->get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec));
+            MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
+            if (SrcMI->getOperand(1).isKill()) {
+              MI.getOperand(1).setIsKill(true);
+              SrcMI->getOperand(1).setIsKill(false);
+            } else
+              // About to replace MI.getOperand(1), clear its kill flag.
+              MI.getOperand(1).setIsKill(false);
           }
-          Simplified = true;
-          NumRotatesCollapsed++;
 
           LLVM_DEBUG(dbgs() << "With: ");
           LLVM_DEBUG(MI.dump());
@@ -925,16 +932,7 @@
           // than NewME. Otherwise we get a 64 bit value after folding, but MI
           // returns a 32 bit value.
 
-          // If FoldingReg has only one use and it is not RLWINM_rec and
-          // RLWINM8_rec, it is safe to delete its def SrcMI. Otherwise keep it.
-          if (MRI->hasOneNonDBGUse(FoldingReg) &&
-              (SrcMI->getOpcode() == PPC::RLWINM ||
-               SrcMI->getOpcode() == PPC::RLWINM8)) {
-            ToErase = SrcMI;
-            LLVM_DEBUG(dbgs() << "Delete dead instruction: ");
-            LLVM_DEBUG(SrcMI->dump());
-          }
-
+          Simplified = true;
           LLVM_DEBUG(dbgs() << "Converting Instr: ");
           LLVM_DEBUG(MI.dump());
 
@@ -953,12 +951,20 @@
           // About to replace MI.getOperand(1), clear its kill flag.
           MI.getOperand(1).setIsKill(false);
 
-          Simplified = true;
-          NumRotatesCollapsed++;
-
           LLVM_DEBUG(dbgs() << "To: ");
           LLVM_DEBUG(MI.dump());
         }
+        if (Simplified) {
+          // If FoldingReg has no non-debug use and it has no implicit def (it
+          // is not RLWINM_rec or RLWINM8_rec), it's safe to delete its def
+          // SrcMI. Otherwise keep it.
+          ++NumRotatesCollapsed;
+          if (MRI->use_nodbg_empty(FoldingReg) && !SrcMI->hasImplicitDef()) {
+            ToErase = SrcMI;
+            LLVM_DEBUG(dbgs() << "Delete dead instruction: ");
+            LLVM_DEBUG(SrcMI->dump());
+          }
+        }
         break;
       }
     }
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index dc19cb0ac3093..77122e62dd5fb 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -151,6 +151,9 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
       TargetTriple.isMusl())
     SecurePlt = true;
 
+  if (TargetTriple.getSubArch() == Triple::PPCSubArch_spe)
+    HasSPE = true;
+
   if (HasSPE && IsPPC64)
     report_fatal_error( "SPE is only supported for 32-bit targets.\n", false);
   if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU))
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 97a1eb2f190a9..f070b143d5b4e 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/MIRFormatter.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
@@ -37,7 +38,9 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString,
     : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU),
       TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr),
       RequireStructuredCFG(false), O0WantsFastISel(false),
-      DefaultOptions(Options), Options(Options) {}
+      DefaultOptions(Options), Options(Options) {
+  MIRF = std::make_unique<MIRFormatter>();
+}
 
 TargetMachine::~TargetMachine() = default;
 
diff --git a/llvm/lib/Target/VE/CMakeLists.txt b/llvm/lib/Target/VE/CMakeLists.txt
new file mode 100644
index 0000000000000..a3eb8bae4ac4a
--- /dev/null
+++ b/llvm/lib/Target/VE/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(LLVM_TARGET_DEFINITIONS VE.td)
+
+add_llvm_target(VECodeGen
+  VETargetMachine.cpp
+  )
+
+add_subdirectory(TargetInfo)
+add_subdirectory(MCTargetDesc)
diff --git a/llvm/lib/Target/VE/LLVMBuild.txt b/llvm/lib/Target/VE/LLVMBuild.txt
new file mode 100644
index 0000000000000..b45efd45c8aca
--- /dev/null
+++ b/llvm/lib/Target/VE/LLVMBuild.txt
@@ -0,0 +1,33 @@
+;===- ./lib/Target/VE/LLVMBuild.txt ----------------------------*- Conf -*--===;
+;
+; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+; See https://llvm.org/LICENSE.txt for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = MCTargetDesc TargetInfo + +[component_0] +type = TargetGroup +name = VE +parent = Target +has_asmparser = 0 +has_asmprinter = 0 + +[component_1] +type = Library +name = VECodeGen +parent = VE +required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG + VEDesc VEInfo Support Target +add_to_library_groups = VE diff --git a/llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt new file mode 100644 index 0000000000000..fa2fefbe47f05 --- /dev/null +++ b/llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMVEDesc + VEMCTargetDesc.cpp + ) diff --git a/llvm/lib/Target/VE/MCTargetDesc/LLVMBuild.txt b/llvm/lib/Target/VE/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 0000000000000..e585042e60bba --- /dev/null +++ b/llvm/lib/Target/VE/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Target/VE/MCTargetDesc/LLVMBuild.txt ---------------*- Conf -*--===; +; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = VEDesc +parent = VE +required_libraries = MC VEInfo Support +add_to_library_groups = VE diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp new file mode 100644 index 0000000000000..7067f34a016f7 --- /dev/null +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp @@ -0,0 +1,19 @@ +//===-- VEMCTargetDesc.cpp - VE Target Descriptions -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides VE specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "VEMCTargetDesc.h" + +using namespace llvm; + +extern "C" void LLVMInitializeVETargetMC() { + // TODO +} diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h new file mode 100644 index 0000000000000..a7969042606c0 --- /dev/null +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h @@ -0,0 +1,27 @@ +//===-- VEMCTargetDesc.h - VE Target Descriptions ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides VE specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_VE_MCTARGETDESC_VEMCTARGETDESC_H +#define LLVM_LIB_TARGET_VE_MCTARGETDESC_VEMCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" + +#include + +namespace llvm { + +class Target; +Target &getTheVETarget(); + +} // end llvm namespace + +#endif diff --git a/llvm/lib/Target/VE/TargetInfo/CMakeLists.txt b/llvm/lib/Target/VE/TargetInfo/CMakeLists.txt new file mode 100644 index 0000000000000..0850b0f27bf2a --- /dev/null +++ b/llvm/lib/Target/VE/TargetInfo/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_component_library(LLVMVEInfo + VETargetInfo.cpp + ) diff --git a/llvm/lib/Target/VE/TargetInfo/LLVMBuild.txt b/llvm/lib/Target/VE/TargetInfo/LLVMBuild.txt new file mode 100644 index 0000000000000..c440132476a4d --- /dev/null +++ b/llvm/lib/Target/VE/TargetInfo/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Target/VE/TargetInfo/LLVMBuild.txt -----------------*- Conf -*--===; +; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = VEInfo +parent = VE +required_libraries = Support +add_to_library_groups = VE diff --git a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp new file mode 100644 index 0000000000000..be68fe7d24291 --- /dev/null +++ b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp @@ -0,0 +1,23 @@ +//===-- VETargetInfo.cpp - VE Target Implementation -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "VE.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +Target &llvm::getTheVETarget() { + static Target TheVETarget; + return TheVETarget; +} + +extern "C" void LLVMInitializeVETargetInfo() { + RegisterTarget X(getTheVETarget(), "ve", + "VE", "VE"); +} diff --git a/llvm/lib/Target/VE/VE.h b/llvm/lib/Target/VE/VE.h new file mode 100644 index 0000000000000..51d3e701f8ec0 --- /dev/null +++ b/llvm/lib/Target/VE/VE.h @@ -0,0 +1,19 @@ +//===-- VE.h - Top-level interface for VE representation --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// VE back-end. 
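VETargetInfo.cpp above follows the standard registration pattern: a function-local static Target plus an LLVMInitializeVETargetInfo() hook that hands it to the registry. A minimal consumer sketch showing how the registered short name "ve" becomes visible to tools; it assumes an LLVM build with the VE target enabled and uses the TargetRegistry lookup overload that resolves architecture names:

  #include "llvm/ADT/Triple.h"
  #include "llvm/Support/TargetRegistry.h"
  #include "llvm/Support/TargetSelect.h"
  #include <string>

  int main() {
    llvm::InitializeAllTargetInfos(); // runs LLVMInitializeVETargetInfo()
    std::string Err;
    llvm::Triple TT;
    // Look up by the short name passed to RegisterTarget above: "ve".
    const llvm::Target *T = llvm::TargetRegistry::lookupTarget("ve", TT, Err);
    return T ? 0 : 1;
  }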
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_VE_VE_H +#define LLVM_LIB_TARGET_VE_VE_H + +#include "MCTargetDesc/VEMCTargetDesc.h" + +#endif diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp new file mode 100644 index 0000000000000..10fe9ba0e7ebc --- /dev/null +++ b/llvm/lib/Target/VE/VETargetMachine.cpp @@ -0,0 +1,62 @@ +//===-- VETargetMachine.cpp - Define TargetMachine for VE -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "VETargetMachine.h" +#include "VE.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define DEBUG_TYPE "ve" + +extern "C" void LLVMInitializeVETarget() { + // Register the target. + RegisterTargetMachine X(getTheVETarget()); +} + +static std::string computeDataLayout(const Triple &T) { + // Aurora VE is little endian + std::string Ret = "e"; + + // Use ELF mangling + Ret += "-m:e"; + + // Alignments for 64 bit integers. + Ret += "-i64:64"; + + // VE supports 32 bit and 64 bits integer on registers + Ret += "-n32:64"; + + // Stack alignment is 64 bits + Ret += "-S64"; + + return Ret; +} + +static Reloc::Model getEffectiveRelocModel(Optional RM) { + if (!RM.hasValue()) + return Reloc::Static; + return *RM; +} + +/// Create an Aurora VE architecture model +VETargetMachine::VETargetMachine( + const Target &T, const Triple &TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Optional RM, + Optional CM, CodeGenOpt::Level OL, bool JIT) + : LLVMTargetMachine( + T, computeDataLayout(TT), TT, CPU, FS, Options, + getEffectiveRelocModel(RM), + getEffectiveCodeModel(CM, CodeModel::Small), + OL) +{} + +VETargetMachine::~VETargetMachine() {} diff --git a/llvm/lib/Target/VE/VETargetMachine.h b/llvm/lib/Target/VE/VETargetMachine.h new file mode 100644 index 0000000000000..ac6089036ff8e --- /dev/null +++ b/llvm/lib/Target/VE/VETargetMachine.h @@ -0,0 +1,31 @@ +//===-- VETargetMachine.h - Define TargetMachine for VE ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the VE specific subclass of TargetMachine. 
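The string assembled by computeDataLayout() in VETargetMachine.cpp above is worth decoding once. A standalone check, independent of LLVM, that re-concatenates the documented pieces and confirms the final layout string:

  #include <cassert>
  #include <string>

  int main() {
    std::string DL = "e";  // little endian
    DL += "-m:e";          // ELF-style name mangling
    DL += "-i64:64";       // 64-bit integers are 64-bit aligned
    DL += "-n32:64";       // native integer widths: 32 and 64 bits
    DL += "-S64";          // natural stack alignment: 64 bits
    assert(DL == "e-m:e-i64:64-n32:64-S64");
    return 0;
  }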
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_VE_VETARGETMACHINE_H +#define LLVM_LIB_TARGET_VE_VETARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class VETargetMachine : public LLVMTargetMachine { +public: + VETargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Optional RM, Optional CM, + CodeGenOpt::Level OL, bool JIT); + ~VETargetMachine() override; +}; + +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index 0915a1532df9e..b1d2de29c8965 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -702,6 +702,9 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { } Blocks.push_back(SuccMBB); + + // After this, EFLAGS will be recreated before each use. + SuccMBB->removeLiveIn(X86::EFLAGS); } } while (!Blocks.empty()); diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 13fcf6aa72472..39e2057b1b6ee 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -2824,90 +2825,12 @@ struct AAIsDeadFunction : public AAIsDead { bool MayReturn = !NoReturnAA.isAssumedNoReturn(); if (MayReturn && (!Invoke2CallAllowed || !isa(CB))) continue; - Instruction *I = const_cast(DeadEndI); - BasicBlock *BB = I->getParent(); - Instruction *SplitPos = I->getNextNode(); - // TODO: mark stuff before unreachable instructions as dead. - - if (auto *II = dyn_cast(I)) { - // If we keep the invoke the split position is at the beginning of the - // normal desitination block (it invokes a noreturn function after all). - BasicBlock *NormalDestBB = II->getNormalDest(); - SplitPos = &NormalDestBB->front(); - - /// Invoke is replaced with a call and unreachable is placed after it if - /// the callee is nounwind and noreturn. Otherwise, we keep the invoke - /// and only place an unreachable in the normal successor. - if (Invoke2CallAllowed) { - if (II->getCalledFunction()) { - const IRPosition &IPos = IRPosition::callsite_function(*II); - const auto &AANoUnw = A.getAAFor(*this, IPos); - if (AANoUnw.isAssumedNoUnwind()) { - LLVM_DEBUG(dbgs() - << "[AAIsDead] Replace invoke with call inst\n"); - CallInst *CI = createCallMatchingInvoke(II); - CI->insertBefore(II); - CI->takeName(II); - replaceAllInstructionUsesWith(*II, *CI); - - // If this is a nounwind + mayreturn invoke we only remove the - // unwind edge. This is done by moving the invoke into a new and - // dead block and connecting the normal destination of the invoke - // with a branch that follows the call replacement we created - // above. - if (MayReturn) { - BasicBlock *NewDeadBB = - SplitBlock(BB, II, nullptr, nullptr, nullptr, ".i2c"); - assert(isa(BB->getTerminator()) && - BB->getTerminator()->getNumSuccessors() == 1 && - BB->getTerminator()->getSuccessor(0) == NewDeadBB); - new UnreachableInst(I->getContext(), NewDeadBB); - BB->getTerminator()->setOperand(0, NormalDestBB); - A.deleteAfterManifest(*II); - continue; - } - - // We do not need an invoke (II) but instead want a call followed - // by an unreachable. 
However, we do not remove II as other - // abstract attributes might have it cached as part of their - // results. Given that we modify the CFG anyway, we simply keep II - // around but in a new dead block. To avoid II being live through - // a different edge we have to ensure the block we place it in is - // only reached from the current block of II and then not reached - // at all when we insert the unreachable. - SplitBlockPredecessors(NormalDestBB, {BB}, ".i2c"); - SplitPos = CI->getNextNode(); - } - } - } - - if (SplitPos == &NormalDestBB->front()) { - // If this is an invoke of a noreturn function the edge to the normal - // destination block is dead but not necessarily the block itself. - // TODO: We need to move to an edge based system during deduction and - // also manifest. - assert(!NormalDestBB->isLandingPad() && - "Expected the normal destination not to be a landingpad!"); - if (NormalDestBB->getUniquePredecessor() == BB) { - assumeLive(A, *NormalDestBB); - } else { - BasicBlock *SplitBB = - SplitBlockPredecessors(NormalDestBB, {BB}, ".dead"); - // The split block is live even if it contains only an unreachable - // instruction at the end. - assumeLive(A, *SplitBB); - SplitPos = SplitBB->getTerminator(); - HasChanged = ChangeStatus::CHANGED; - } - } - } - if (isa_and_nonnull(SplitPos)) - continue; - - BB = SplitPos->getParent(); - SplitBlock(BB, SplitPos); - A.changeToUnreachableAfterManifest(BB->getTerminator()); + if (auto *II = dyn_cast(DeadEndI)) + A.registerInvokeWithDeadSuccessor(const_cast(*II)); + else + A.changeToUnreachableAfterManifest( + const_cast(DeadEndI->getNextNode())); HasChanged = ChangeStatus::CHANGED; } @@ -5668,8 +5591,35 @@ ChangeStatus Attributor::run(Module &M) { } } } - for (Instruction *I : ToBeChangedToUnreachableInsts) - changeToUnreachable(I, /* UseLLVMTrap */ false); + for (auto &V : InvokeWithDeadSuccessor) + if (InvokeInst *II = dyn_cast_or_null(V)) { + bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind); + bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn); + bool Invoke2CallAllowed = + !AAIsDeadFunction::mayCatchAsynchronousExceptions( + *II->getFunction()); + assert((UnwindBBIsDead || NormalBBIsDead) && + "Invoke does not have dead successors!"); + BasicBlock *BB = II->getParent(); + BasicBlock *NormalDestBB = II->getNormalDest(); + if (UnwindBBIsDead) { + Instruction *NormalNextIP = &NormalDestBB->front(); + if (Invoke2CallAllowed) { + changeToCall(II); + NormalNextIP = BB->getTerminator(); + } + if (NormalBBIsDead) + ToBeChangedToUnreachableInsts.insert(NormalNextIP); + } else { + assert(NormalBBIsDead && "Broken invariant!"); + if (!NormalDestBB->getUniquePredecessor()) + NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead"); + ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front()); + } + } + for (auto &V : ToBeChangedToUnreachableInsts) + if (Instruction *I = dyn_cast_or_null(V)) + changeToUnreachable(I, /* UseLLVMTrap */ false); for (Instruction *I : TerminatorsToFold) ConstantFoldTerminator(I->getParent()); @@ -6336,7 +6286,9 @@ static bool runAttributorOnModule(Module &M, AnalysisGetter &AG) { A.identifyDefaultAbstractAttributes(F); } - return A.run(M) == ChangeStatus::CHANGED; + bool Changed = A.run(M) == ChangeStatus::CHANGED; + assert(!verifyModule(M, &errs()) && "Module verification failed!"); + return Changed; } PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp 
b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index f7b39d98d4923..2774e46151faf 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1239,6 +1239,14 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I); return BinaryOperator::CreateFDivFMF(YZ, X, &I); } + // Z / (1.0 / Y) => (Y * Z) + // + // This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. The + // m_OneUse check is avoided because even in the case of the multiple uses + // for 1.0/Y, the number of instructions remain the same and a division is + // replaced by a multiplication. + if (match(Op1, m_FDiv(m_SpecificFP(1.0), m_Value(Y)))) + return BinaryOperator::CreateFMulFMF(Y, Op0, &I); } if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) { diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 558f63113db63..92ad8dafa5abc 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -427,51 +427,76 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, return UnrollResult; } +static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI, + ScalarEvolution &SE, + const TargetTransformInfo &TTI, + AssumptionCache &AC, DependenceInfo &DI, + OptimizationRemarkEmitter &ORE, + int OptLevel) { + bool DidSomething = false; + + // The loop unroll and jam pass requires loops to be in simplified form, and also needs LCSSA. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop unroll and jam pass + // will simplify all loops, regardless of whether anything end up being + // unroll and jammed. + for (auto &L : LI) { + DidSomething |= + simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); + DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE); + } + + SmallPriorityWorklist Worklist; + internal::appendLoopsToWorklist(reverse(LI), Worklist); + while (!Worklist.empty()) { + Loop *L = Worklist.pop_back_val(); + formLCSSA(*L, DT, &LI, &SE); + LoopUnrollResult Result = + tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel); + if (Result != LoopUnrollResult::Unmodified) + DidSomething = true; + } + + return DidSomething; +} + namespace { -class LoopUnrollAndJam : public LoopPass { +class LoopUnrollAndJam : public FunctionPass { public: static char ID; // Pass ID, replacement for typeid unsigned OptLevel; - LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) { + LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) { initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (skipLoop(L)) + bool runOnFunction(Function &F) override { + if (skipFunction(F)) return false; - Function &F = *L->getHeader()->getParent(); - auto &DT = getAnalysis().getDomTree(); - LoopInfo *LI = &getAnalysis().getLoopInfo(); + LoopInfo &LI = getAnalysis().getLoopInfo(); ScalarEvolution &SE = getAnalysis().getSE(); const TargetTransformInfo &TTI = getAnalysis().getTTI(F); auto &AC = getAnalysis().getAssumptionCache(F); auto &DI = getAnalysis().getDI(); - // For the old PM, we can't use OptimizationRemarkEmitter as an analysis - // pass. 
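The InstCombine hunk above rewrites Z / (1.0 / Y) into Y * Z, trading two divisions for one multiplication. A quick numeric sanity check of the identity; note it is only value-preserving up to rounding in general, which is why the surrounding code guards the transform with reassociation fast-math flags:

  #include <cassert>
  #include <cmath>

  int main() {
    double Y = 8.0, Z = 3.0;
    double Before = Z / (1.0 / Y); // two divisions
    double After = Y * Z;          // one multiplication
    // Exact here because 1.0/8.0 is representable; in general the two
    // expressions differ by rounding error, hence the fast-math guard.
    assert(std::fabs(Before - After) < 1e-12);
    return 0;
  }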
Function analyses need to be preserved across loop transformations - // but ORE cannot be preserved (see comment before the pass definition). - OptimizationRemarkEmitter ORE(&F); - - LoopUnrollResult Result = - tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); + auto &ORE = getAnalysis().getORE(); - if (Result == LoopUnrollResult::FullyUnrolled) - LPM.markLoopAsDeleted(*L); - - return Result != LoopUnrollResult::Unmodified; + return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); } /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); - getLoopAnalysisUsage(AU); + AU.addRequired(); } }; @@ -481,10 +506,13 @@ char LoopUnrollAndJam::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) @@ -492,26 +520,18 @@ Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) { return new LoopUnrollAndJam(OptLevel); } -PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &) { - const auto &FAM = - AM.getResult(L, AR).getManager(); - Function *F = L.getHeader()->getParent(); - - auto *ORE = FAM.getCachedResult(*F); - // FIXME: This should probably be optional rather than required. - if (!ORE) - report_fatal_error( - "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at " - "a higher level"); - - DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); - - LoopUnrollResult Result = tryToUnrollAndJamLoop( - &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel); - - if (Result == LoopUnrollResult::Unmodified) +PreservedAnalyses LoopUnrollAndJamPass::run(Function &F, + FunctionAnalysisManager &AM) { + ScalarEvolution &SE = AM.getResult(F); + LoopInfo &LI = AM.getResult(F); + TargetTransformInfo &TTI = AM.getResult(F); + AssumptionCache &AC = AM.getResult(F); + DominatorTree &DT = AM.getResult(F); + DependenceInfo &DI = AM.getResult(F); + OptimizationRemarkEmitter &ORE = + AM.getResult(F); + + if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index a9566422a8324..0ff6ee8bcfcc2 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -10,9 +10,6 @@ // // TODO: // * Implement multiply & add fusion -// * Implement shape propagation -// * Implement optimizations to reduce or eliminateshufflevector uses by using -// shape information. // * Add remark, summarizing the available matrix optimization opportunities. 
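Backing up to the LoopUnrollAndJamPass conversion above: the new function-level driver has one subtlety worth isolating. Loop simplification runs over every loop before any legality check, so the pass must report "changed" even when nothing ends up being unroll-and-jammed. A toy model of that control flow; LoopLike, simplify, and tryTransform are stand-ins, not LLVM API:

  #include <vector>

  struct LoopLike {
    bool NeedsSimplify = true;
    bool Profitable = false;
  };

  static bool simplify(LoopLike &L) {
    bool Changed = L.NeedsSimplify;
    L.NeedsSimplify = false;
    return Changed;
  }

  static bool tryTransform(const LoopLike &L) { return L.Profitable; }

  static bool runOnAllLoops(std::vector<LoopLike> &Loops) {
    bool DidSomething = false;
    // Simplify everything up front, as the real pass does with
    // simplifyLoop()/formLCSSARecursively(), and record any change.
    for (LoopLike &L : Loops)
      DidSomething |= simplify(L);
    // Then attempt the transform on each loop in the worklist.
    for (LoopLike &L : Loops)
      DidSomething |= tryTransform(L);
    return DidSomething;
  }

  int main() {
    std::vector<LoopLike> Loops(2);
    return runOnAllLoops(Loops) ? 0 : 1; // "changed" with no unrolling
  }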
// //===----------------------------------------------------------------------===// @@ -95,20 +92,20 @@ Value *computeColumnAddr(Value *BasePtr, Value *Col, Value *Stride, unsigned AS = cast(BasePtr->getType())->getAddressSpace(); // Compute the start of the column with index Col as Col * Stride. - Value *ColumnStart = Builder.CreateMul(Col, Stride); + Value *ColumnStart = Builder.CreateMul(Col, Stride, "col.start"); // Get pointer to the start of the selected column. Skip GEP creation, // if we select column 0. if (isa(ColumnStart) && cast(ColumnStart)->isZero()) ColumnStart = BasePtr; else - ColumnStart = Builder.CreateGEP(EltType, BasePtr, ColumnStart); + ColumnStart = Builder.CreateGEP(EltType, BasePtr, ColumnStart, "col.gep"); // Cast elementwise column start pointer to a pointer to a column // (EltType x NumRows)*. Type *ColumnType = VectorType::get(EltType, NumRows); Type *ColumnPtrType = PointerType::get(ColumnType, AS); - return Builder.CreatePointerCast(ColumnStart, ColumnPtrType); + return Builder.CreatePointerCast(ColumnStart, ColumnPtrType, "col.cast"); } /// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics. @@ -317,36 +314,16 @@ class LowerMatrixIntrinsics { default: return false; } - return isUniformShape(V) || isa(V); + return isUniformShape(V) || isa(V) || isa(V); } /// Propagate the shape information of instructions to their users. - void propagateShapeForward() { - // The work list contains instructions for which we can compute the shape, - // either based on the information provided by matrix intrinsics or known - // shapes of operands. - SmallVector WorkList; - - // Initialize the work list with ops carrying shape information. Initially - // only the shape of matrix intrinsics is known. - for (BasicBlock &BB : Func) - for (Instruction &Inst : BB) { - IntrinsicInst *II = dyn_cast(&Inst); - if (!II) - continue; - - switch (II->getIntrinsicID()) { - case Intrinsic::matrix_multiply: - case Intrinsic::matrix_transpose: - case Intrinsic::matrix_columnwise_load: - case Intrinsic::matrix_columnwise_store: - WorkList.push_back(&Inst); - break; - default: - break; - } - } - + /// The work list contains instructions for which we can compute the shape, + /// either based on the information provided by matrix intrinsics or known + /// shapes of operands. + SmallVector + propagateShapeForward(SmallVectorImpl &WorkList) { + SmallVector NewWorkList; // Pop an element for which we guaranteed to have at least one of the // operand shapes. Add the shape for this and then add users to the work // list. @@ -395,16 +372,120 @@ class LowerMatrixIntrinsics { } } - if (Propagate) + if (Propagate) { + NewWorkList.push_back(Inst); for (auto *User : Inst->users()) if (ShapeMap.count(User) == 0) WorkList.push_back(cast(User)); + } } + + return NewWorkList; + } + + /// Propagate the shape to operands of instructions with shape information. + /// \p Worklist contains the instruction for which we already know the shape. + SmallVector + propagateShapeBackward(SmallVectorImpl &WorkList) { + SmallVector NewWorkList; + + auto pushInstruction = [](Value *V, + SmallVectorImpl &WorkList) { + Instruction *I = dyn_cast(V); + if (I) + WorkList.push_back(I); + }; + // Pop an element with known shape. Traverse the operands, if their shape + // derives from the result shape and is unknown, add it and add them to the + // worklist. 
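The worklist discipline described in the comment above is the classic fixed-point scheme: record a shape only when it is new, and let that "newness" both seed further propagation and guarantee termination. A small self-contained model of the uniform-shape case, with `Value` and `operandsOf` as stand-ins for the pass's IR types:

  #include <map>
  #include <vector>

  using Value = int;
  struct Shape { unsigned Rows = 0, Cols = 0; };

  static std::map<Value, Shape> ShapeMap;

  // Stand-in for walking an instruction's operands.
  static std::vector<Value> operandsOf(Value) { return {}; }

  // Returns true only when V's shape was previously unknown; that is what
  // guarantees each value enters the worklist at most once.
  static bool setShape(Value V, Shape S) {
    return ShapeMap.emplace(V, S).second;
  }

  static void propagateBackward(std::vector<Value> WorkList) {
    while (!WorkList.empty()) {
      Value V = WorkList.back();
      WorkList.pop_back();
      // Uniform-shape case: every operand must match the result's shape.
      for (Value Op : operandsOf(V))
        if (setShape(Op, ShapeMap[V]))
          WorkList.push_back(Op);
    }
  }

  int main() {
    setShape(0, {4, 4});
    propagateBackward({0});
    return 0;
  }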
+ LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n"); + while (!WorkList.empty()) { + Value *V = WorkList.back(); + WorkList.pop_back(); + + size_t BeforeProcessingV = WorkList.size(); + if (!isa(V)) + continue; + + Value *MatrixA; + Value *MatrixB; + Value *M; + Value *N; + Value *K; + if (match(V, m_Intrinsic( + m_Value(MatrixA), m_Value(MatrixB), m_Value(M), + m_Value(N), m_Value(K)))) { + if (setShapeInfo(MatrixA, {M, N})) + pushInstruction(MatrixA, WorkList); + + if (setShapeInfo(MatrixB, {N, K})) + pushInstruction(MatrixB, WorkList); + + } else if (match(V, m_Intrinsic( + m_Value(MatrixA), m_Value(M), m_Value(N)))) { + // Flip dimensions. + if (setShapeInfo(MatrixA, {M, N})) + pushInstruction(MatrixA, WorkList); + } else if (match(V, m_Intrinsic( + m_Value(MatrixA), m_Value(), m_Value(), + m_Value(M), m_Value(N)))) { + if (setShapeInfo(MatrixA, {M, N})) { + pushInstruction(MatrixA, WorkList); + } + } else if (isa(V) || + match(V, m_Intrinsic())) { + // Nothing to do, no matrix input. + } else if (isa(V)) { + // Nothing to do. We forward-propagated to this so we would just + // backward propagate to an instruction with an already known shape. + } else if (isUniformShape(V)) { + // Propagate to all operands. + ShapeInfo Shape = ShapeMap[V]; + for (Use &U : cast(V)->operands()) { + if (setShapeInfo(U.get(), Shape)) + pushInstruction(U.get(), WorkList); + } + } + // After we discovered new shape info for new instructions in the + // worklist, we use their users as seeds for the next round of forward + // propagation. + for (size_t I = BeforeProcessingV; I != WorkList.size(); I++) + for (User *U : WorkList[I]->users()) + if (isa(U) && V != U) + NewWorkList.push_back(cast(U)); + } + return NewWorkList; } bool Visit() { - if (EnableShapePropagation) - propagateShapeForward(); + if (EnableShapePropagation) { + SmallVector WorkList; + + // Initially only the shape of matrix intrinsics is known. + // Initialize the work list with ops carrying shape information. + for (BasicBlock &BB : Func) + for (Instruction &Inst : BB) { + IntrinsicInst *II = dyn_cast(&Inst); + if (!II) + continue; + + switch (II->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + case Intrinsic::matrix_transpose: + case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_columnwise_store: + WorkList.push_back(&Inst); + break; + default: + break; + } + } + // Propagate shapes until nothing changes any longer. + while (!WorkList.empty()) { + WorkList = propagateShapeForward(WorkList); + WorkList = propagateShapeBackward(WorkList); + } + } ReversePostOrderTraversal RPOT(&Func); bool Changed = false; @@ -419,6 +500,8 @@ class LowerMatrixIntrinsics { Value *Op2; if (auto *BinOp = dyn_cast(&Inst)) Changed |= VisitBinaryOperator(BinOp); + if (match(&Inst, m_Load(m_Value(Op1)))) + Changed |= VisitLoad(&Inst, Op1, Builder); else if (match(&Inst, m_Store(m_Value(Op1), m_Value(Op2)))) Changed |= VisitStore(&Inst, Op1, Op2, Builder); } @@ -433,7 +516,7 @@ class LowerMatrixIntrinsics { LoadInst *createColumnLoad(Value *ColumnPtr, Type *EltType, IRBuilder<> Builder) { unsigned Align = DL.getABITypeAlignment(EltType); - return Builder.CreateAlignedLoad(ColumnPtr, Align); + return Builder.CreateAlignedLoad(ColumnPtr, Align, "col.load"); } StoreInst *createColumnStore(Value *ColumnValue, Value *ColumnPtr, @@ -474,17 +557,11 @@ class LowerMatrixIntrinsics { return true; } - /// Lowers llvm.matrix.columnwise.load. - /// - /// The intrinsic loads a matrix from memory using a stride between columns. 
- void LowerColumnwiseLoad(CallInst *Inst) { + void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride, + ShapeInfo Shape) { IRBuilder<> Builder(Inst); - Value *Ptr = Inst->getArgOperand(0); - Value *Stride = Inst->getArgOperand(1); auto VType = cast(Inst->getType()); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); - ShapeInfo Shape(Inst->getArgOperand(2), Inst->getArgOperand(3)); - ColumnMatrixTy Result; // Distance between start of one column and the start of the next for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) { @@ -498,6 +575,16 @@ class LowerMatrixIntrinsics { finalizeLowering(Inst, Result, Builder); } + /// Lowers llvm.matrix.columnwise.load. + /// + /// The intrinsic loads a matrix from memory using a stride between columns. + void LowerColumnwiseLoad(CallInst *Inst) { + Value *Ptr = Inst->getArgOperand(0); + Value *Stride = Inst->getArgOperand(1); + LowerLoad(Inst, Ptr, Stride, + {Inst->getArgOperand(2), Inst->getArgOperand(3)}); + } + void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride, ShapeInfo Shape) { IRBuilder<> Builder(Inst); @@ -693,6 +780,16 @@ class LowerMatrixIntrinsics { finalizeLowering(Inst, Result, Builder); } + /// Lower load instructions, if shape information is available. + bool VisitLoad(Instruction *Inst, Value *Ptr, IRBuilder<> &Builder) { + auto I = ShapeMap.find(Inst); + if (I == ShapeMap.end()) + return false; + + LowerLoad(Inst, Ptr, Builder.getInt32(I->second.NumRows), I->second); + return true; + } + bool VisitStore(Instruction *Inst, Value *StoredVal, Value *Ptr, IRBuilder<> &Builder) { auto I = ShapeMap.find(StoredVal); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b7dd3d75e4580..c4c40189fda46 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -714,19 +714,19 @@ Optional llvm::getLoopEstimatedTripCount(Loop *L) { // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. - uint64_t TrueVal, FalseVal; - if (!LatchBR->extractProfMetadata(TrueVal, FalseVal)) + uint64_t BackedgeTakenWeight, LatchExitWeight; + if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) return None; - if (!TrueVal || !FalseVal) + if (LatchBR->getSuccessor(0) != L->getHeader()) + std::swap(BackedgeTakenWeight, LatchExitWeight); + + if (!BackedgeTakenWeight || !LatchExitWeight) return 0; // Divide the count of the backedge by the count of the edge exiting the loop, // rounding to nearest. 
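The rounding described in the comment above is what llvm::divideNearest (MathExtras.h) implements: add half the divisor before dividing. A standalone check that the formula matches the two hand-rolled expressions being deleted below:

  #include <cassert>
  #include <cstdint>

  // Same formula as llvm::divideNearest: round N/D to the nearest integer.
  static uint64_t divideNearest(uint64_t N, uint64_t D) {
    return (N + D / 2) / D;
  }

  int main() {
    // e.g. backedge weight 127, latch-exit weight 2:
    // (127 + 1) / 2 == 64, so the body ran about 64 times per loop entry.
    assert(divideNearest(127, 2) == 64);
    assert(divideNearest(10, 4) == 3); // 2.5 rounds up to 3
    return 0;
  }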
- if (LatchBR->getSuccessor(0) == L->getHeader()) - return (TrueVal + (FalseVal / 2)) / FalseVal; - else - return (FalseVal + (TrueVal / 2)) / TrueVal; + return llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight); } bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fd30d52a562a2..0400e44dd0ecf 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7502,30 +7502,43 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); } -static ScalarEpilogueLowering -getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, - ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, - TargetTransformInfo *TTI, TargetLibraryInfo *TLI, - AssumptionCache *AC, LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, - const LoopAccessInfo *LAI) { - ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; +// Determine how to lower the scalar epilogue, which depends on 1) optimising +// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing +// predication, and 4) a TTI hook that analyses whether the loop is suitable +// for predication. +static ScalarEpilogueLowering getScalarEpilogueLowering( + Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + LoopVectorizationLegality &LVL) { + bool OptSize = + F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + // 1) OptSize takes precedence over all other options, i.e. if this is set, + // don't look at hints or options, and don't request a scalar epilogue. + if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + return CM_ScalarEpilogueNotAllowedOptSize; + bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && !PreferPredicateOverEpilog; - if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, - PGSOQueryType::IRPass))) - SEL = CM_ScalarEpilogueNotAllowedOptSize; - else if (PreferPredicateOverEpilog || - Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || - (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) && - Hints.getPredicate() != LoopVectorizeHints::FK_Disabled && - !PredicateOptDisabled)) - SEL = CM_ScalarEpilogueNotNeededUsePredicate; + // 2) Next, if disabling predication is requested on the command line, honour + // this and request a scalar epilogue. Also do this if we don't have a + // primary induction variable, which is required for predication. + if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + return CM_ScalarEpilogueAllowed; + + // 3) and 4) Check whether predication is requested on the command line or + // with a loop hint, or whether the TTI hook indicates it is profitable; if + // so, request predication.
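The numbered comments above encode a strict precedence. A restatement of that order as plain code; the enum values mirror the pass, while the bool parameters collapse the option/hint/TTI plumbing and are purely illustrative:

  #include <cassert>

  enum ScalarEpilogueLowering {
    CM_ScalarEpilogueAllowed,
    CM_ScalarEpilogueNotAllowedOptSize,
    CM_ScalarEpilogueNotNeededUsePredicate
  };

  static ScalarEpilogueLowering
  decideEpilogue(bool OptSize, bool ForceVectorize, bool PredicateOptDisabled,
                 bool HasPrimaryInduction, bool PredicationRequested) {
    // 1) Code size wins unless vectorization is explicitly forced.
    if (OptSize && !ForceVectorize)
      return CM_ScalarEpilogueNotAllowedOptSize;
    // 2) Predication disabled on the command line, or no primary induction
    //    variable (predication needs one): keep the scalar epilogue.
    if (PredicateOptDisabled || !HasPrimaryInduction)
      return CM_ScalarEpilogueAllowed;
    // 3)+4) Option, hint, or TTI preference selects tail folding instead.
    if (PredicationRequested)
      return CM_ScalarEpilogueNotNeededUsePredicate;
    return CM_ScalarEpilogueAllowed;
  }

  int main() {
    assert(decideEpilogue(true, false, false, true, true) ==
           CM_ScalarEpilogueNotAllowedOptSize);
    assert(decideEpilogue(false, false, true, true, true) ==
           CM_ScalarEpilogueAllowed);
    return 0;
  }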
+ if (PreferPredicateOverEpilog || + Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || + (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI()) && + Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + return CM_ScalarEpilogueNotNeededUsePredicate; - return SEL; + return CM_ScalarEpilogueAllowed; } // Process the loop in the VPlan-native vectorization path. This path builds @@ -7543,9 +7556,8 @@ static bool processLoopInVPlanNativePath( Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = - getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, - PSE.getSE(), DT, LVL->getLAI()); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); @@ -7637,9 +7649,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. - ScalarEpilogueLowering SEL = - getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, - PSE.getSE(), DT, LVL.getLAI()); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before diff --git a/llvm/test/Analysis/ScalarEvolution/range_nw_flag.ll b/llvm/test/Analysis/ScalarEvolution/range_nw_flag.ll index 65ecd312e0cbb..4a520056d7657 100644 --- a/llvm/test/Analysis/ScalarEvolution/range_nw_flag.ll +++ b/llvm/test/Analysis/ScalarEvolution/range_nw_flag.ll @@ -39,3 +39,22 @@ exit: ret void } +; CHECK-LABEL: @test-addrec-nsw +; CHECK: --> {(-1 + (-10 smin %offset)),+,-1}<%loop> U: [-2147483648,1) S: [-2147483648,1) +define void @test-addrec-nsw(float* %input, i32 %offset, i32 %numIterations) { +entry: + %cmp = icmp slt i32 %offset, -10 + %max = select i1 %cmp, i32 %offset, i32 -10 + br label %loop +loop: + %i = phi i32 [ %nexti, %loop ], [ 0, %entry ] + %nexti = add nsw i32 %i, -1 + %index32 = add nsw i32 %nexti, %max + %ptr = getelementptr inbounds float, float* %input, i32 %index32 + %f = load float, float* %ptr, align 4 + %exitcond = icmp eq i32 %nexti, %numIterations + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AArch64/settag-merge.ll b/llvm/test/CodeGen/AArch64/settag-merge.ll deleted file mode 100644 index 1bc93a82070f0..0000000000000 --- a/llvm/test/CodeGen/AArch64/settag-merge.ll +++ /dev/null @@ -1,214 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64 -mattr=+mte | FileCheck %s - -declare void @use(i8* %p) -declare void @llvm.aarch64.settag(i8* %p, i64 %a) -declare void @llvm.aarch64.settag.zero(i8* %p, i64 %a) - -define void @stg16_16() { -entry: -; CHECK-LABEL: stg16_16: -; CHECK: st2g sp, [sp], #32 -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - ret void -} - -define i32 @stg16_16_16_16_ret() { -entry: -; CHECK-LABEL: stg16_16_16_16_ret: -; CHECK: st2g sp, [sp, #32] -; CHECK: st2g sp, [sp], #64 -; CHECK: mov w0, wzr -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - %c = alloca i8, i32 16, align 16 - %d = alloca i8, i32 16, align 16 - call void 
@llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - call void @llvm.aarch64.settag(i8* %c, i64 16) - call void @llvm.aarch64.settag(i8* %d, i64 16) - ret i32 0 -} - -define void @stg16_16_16_16() { -entry: -; CHECK-LABEL: stg16_16_16_16: -; CHECK: st2g sp, [sp, #32] -; CHECK: st2g sp, [sp], #64 -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - %c = alloca i8, i32 16, align 16 - %d = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - call void @llvm.aarch64.settag(i8* %c, i64 16) - call void @llvm.aarch64.settag(i8* %d, i64 16) - ret void -} - -define void @stg128_128_128_128() { -entry: -; CHECK-LABEL: stg128_128_128_128: -; CHECK: mov x8, #512 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 128, align 16 - %b = alloca i8, i32 128, align 16 - %c = alloca i8, i32 128, align 16 - %d = alloca i8, i32 128, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 128) - call void @llvm.aarch64.settag(i8* %b, i64 128) - call void @llvm.aarch64.settag(i8* %c, i64 128) - call void @llvm.aarch64.settag(i8* %d, i64 128) - ret void -} - -define void @stg16_512_16() { -entry: -; CHECK-LABEL: stg16_512_16: -; CHECK: mov x8, #544 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 512, align 16 - %c = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 512) - call void @llvm.aarch64.settag(i8* %c, i64 16) - ret void -} - -define void @stg512_512_512() { -entry: -; CHECK-LABEL: stg512_512_512: -; CHECK: mov x8, #1536 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 512, align 16 - %b = alloca i8, i32 512, align 16 - %c = alloca i8, i32 512, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 512) - call void @llvm.aarch64.settag(i8* %b, i64 512) - call void @llvm.aarch64.settag(i8* %c, i64 512) - ret void -} - -define void @early(i1 %flag) { -entry: -; CHECK-LABEL: early: -; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] -; CHECK: st2g sp, [sp, # -; CHECK: st2g sp, [sp, # -; CHECK: st2g sp, [sp, # -; CHECK: [[LABEL]]: -; CHECK: stg sp, [sp, # -; CHECK: st2g sp, [sp], # -; CHECK: ret - %a = alloca i8, i32 48, align 16 - %b = alloca i8, i32 48, align 16 - %c = alloca i8, i32 48, align 16 - br i1 %flag, label %if.then, label %if.end - -if.then: - call void @llvm.aarch64.settag(i8* %a, i64 48) - call void @llvm.aarch64.settag(i8* %b, i64 48) - br label %if.end - -if.end: - call void @llvm.aarch64.settag(i8* %c, i64 48) - ret void -} - -define void @early_128_128(i1 %flag) { -entry: -; CHECK-LABEL: early_128_128: -; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] -; CHECK: add x9, sp, # -; CHECK: mov x8, #256 -; CHECK: st2g x9, [x9], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: [[LABEL]]: -; CHECK: stg sp, [sp, # -; CHECK: st2g sp, [sp], # -; CHECK: ret - %a = alloca i8, i32 128, align 16 - %b = alloca i8, i32 128, align 16 - %c = alloca i8, i32 48, align 16 - br i1 %flag, label %if.then, label %if.end - -if.then: - call void @llvm.aarch64.settag(i8* %a, i64 128) - call void @llvm.aarch64.settag(i8* %b, i64 128) - br label %if.end - -if.end: - call void @llvm.aarch64.settag(i8* %c, i64 48) - ret void -} - -define void @early_512_512(i1 %flag) { -entry: -; CHECK-LABEL: early_512_512: -; CHECK: 
tbz w0, #0, [[LABEL:.LBB.*]] -; CHECK: add x9, sp, # -; CHECK: mov x8, #1024 -; CHECK: st2g x9, [x9], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: [[LABEL]]: -; CHECK: stg sp, [sp, # -; CHECK: st2g sp, [sp], # -; CHECK: ret - %a = alloca i8, i32 512, align 16 - %b = alloca i8, i32 512, align 16 - %c = alloca i8, i32 48, align 16 - br i1 %flag, label %if.then, label %if.end - -if.then: - call void @llvm.aarch64.settag(i8* %a, i64 512) - call void @llvm.aarch64.settag(i8* %b, i64 512) - br label %if.end - -if.end: - call void @llvm.aarch64.settag(i8* %c, i64 48) - ret void -} - -; Two loops of size 256; the second loop updates SP. -define void @stg128_128_gap_128_128() { -entry: -; CHECK-LABEL: stg128_128_gap_128_128: -; CHECK: mov x9, sp -; CHECK: mov x8, #256 -; CHECK: st2g x9, [x9], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: mov x8, #256 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 128, align 16 - %a2 = alloca i8, i32 128, align 16 - %b = alloca i8, i32 32, align 16 - %c = alloca i8, i32 128, align 16 - %c2 = alloca i8, i32 128, align 16 - call void @use(i8* %b) - call void @llvm.aarch64.settag(i8* %a, i64 128) - call void @llvm.aarch64.settag(i8* %a2, i64 128) - call void @llvm.aarch64.settag(i8* %c, i64 128) - call void @llvm.aarch64.settag(i8* %c2, i64 128) - ret void -} diff --git a/llvm/test/CodeGen/AArch64/settag-merge.mir b/llvm/test/CodeGen/AArch64/settag-merge.mir deleted file mode 100644 index dc2a00c7d3d37..0000000000000 --- a/llvm/test/CodeGen/AArch64/settag-merge.mir +++ /dev/null @@ -1,83 +0,0 @@ -# RUN: llc -mtriple=aarch64 -mattr=+mte -run-pass=prologepilog %s -o - | FileCheck %s - ---- | - declare void @llvm.aarch64.settag(i8* nocapture writeonly, i64) argmemonly nounwind writeonly "target-features"="+mte" - define i32 @stg16_16_16_16_ret() "target-features"="+mte" { - entry: - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - %c = alloca i8, i32 16, align 16 - %d = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - call void @llvm.aarch64.settag(i8* %c, i64 16) - call void @llvm.aarch64.settag(i8* %d, i64 16) - ret i32 0 - } - - define void @stg16_store_128() "target-features"="+mte" { - entry: - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 128, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - store i8 42, i8* %a - call void @llvm.aarch64.settag(i8* %b, i64 128) - ret void - } - -... ---- -# A sequence of STG with a register copy in the middle. -# Can be merged into ST2G + ST2G. -# CHECK-LABEL: name:{{.*}}stg16_16_16_16_ret -# CHECK-DAG: ST2GOffset $sp, $sp, 2 -# CHECK-DAG: ST2GOffset $sp, $sp, 0 -# CHECK-DAG: $w0 = COPY $wzr -# CHECK-DAG: RET_ReallyLR implicit killed $w0 - -name: stg16_16_16_16_ret -tracksRegLiveness: true -stack: - - { id: 0, name: a, size: 16, alignment: 16 } - - { id: 1, name: b, size: 16, alignment: 16 } - - { id: 2, name: c, size: 16, alignment: 16 } - - { id: 3, name: d, size: 16, alignment: 16 } -body: | - bb.0.entry: - STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a) - STGOffset $sp, %stack.1.b, 0 :: (store 16 into %ir.b) - STGOffset $sp, %stack.2.c, 0 :: (store 16 into %ir.c) - $w0 = COPY $wzr - STGOffset $sp, %stack.3.d, 0 :: (store 16 into %ir.d) - RET_ReallyLR implicit killed $w0 - -... - ---- -# A store in the middle prevents merging. 
-# CHECK-LABEL: name:{{.*}}stg16_store_128 -# CHECK: ST2GOffset $sp, $sp, 2 -# CHECK: ST2GOffset $sp, $sp, 4 -# CHECK: ST2GOffset $sp, $sp, 6 -# CHECK: STGOffset $sp, $sp, 8 -# CHECK: STRBBui -# CHECK: ST2GOffset $sp, $sp, 0 -# CHECK: RET_ReallyLR - -name: stg16_store_128 -tracksRegLiveness: true -stack: - - { id: 0, name: a, size: 16, alignment: 16 } - - { id: 1, name: b, size: 128, alignment: 16 } -body: | - bb.0.entry: - STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a) - renamable $w8 = MOVi32imm 42 - ST2GOffset $sp, %stack.1.b, 6 :: (store 32 into %ir.b + 96, align 16) - ST2GOffset $sp, %stack.1.b, 4 :: (store 32 into %ir.b + 64, align 16) - ST2GOffset $sp, %stack.1.b, 2 :: (store 32 into %ir.b + 32, align 16) - STRBBui killed renamable $w8, %stack.0.a, 0 :: (store 1 into %ir.a, align 16) - ST2GOffset $sp, %stack.1.b, 0 :: (store 32 into %ir.b, align 16) - RET_ReallyLR - -... diff --git a/llvm/test/CodeGen/AArch64/settag.ll b/llvm/test/CodeGen/AArch64/settag.ll index 3deeb0155fe87..9ca188fbce325 100644 --- a/llvm/test/CodeGen/AArch64/settag.ll +++ b/llvm/test/CodeGen/AArch64/settag.ll @@ -64,8 +64,8 @@ entry: define void @stg17(i8* %p) { entry: ; CHECK-LABEL: stg17: -; CHECK: stg x0, [x0], #16 ; CHECK: mov {{(w|x)}}[[R:[0-9]+]], #256 +; CHECK: stg x0, [x0], #16 ; CHECK: st2g x0, [x0], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], @@ -87,8 +87,8 @@ entry: define void @stzg17(i8* %p) { entry: ; CHECK-LABEL: stzg17: -; CHECK: stzg x0, [x0], #16 ; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 +; CHECK: stzg x0, [x0], #16 ; CHECK: stz2g x0, [x0], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], @@ -110,10 +110,10 @@ entry: define void @stg_alloca5() { entry: ; CHECK-LABEL: stg_alloca5: -; CHECK: st2g sp, [sp, #32] -; CHECK-NEXT: stg sp, [sp, #64] -; CHECK-NEXT: st2g sp, [sp], #80 -; CHECK-NEXT: ret +; CHECK: stg sp, [sp, #64] +; CHECK: st2g sp, [sp, #32] +; CHECK: st2g sp, [sp] +; CHECK: ret %a = alloca i8, i32 80, align 16 call void @llvm.aarch64.settag(i8* %a, i64 80) ret void @@ -122,11 +122,12 @@ entry: define void @stg_alloca17() { entry: ; CHECK-LABEL: stg_alloca17: +; CHECK: mov [[P:x[0-9]+]], sp +; CHECK: stg [[P]], {{\[}}[[P]]{{\]}}, #16 ; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 -; CHECK: st2g sp, [sp], #32 +; CHECK: st2g [[P]], {{\[}}[[P]]{{\]}}, #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], -; CHECK: stg sp, [sp], #16 ; CHECK: ret %a = alloca i8, i32 272, align 16 call void @llvm.aarch64.settag(i8* %a, i64 272) diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll index ed6ccc8b49413..200837dabfe0e 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll @@ -210,10 +210,11 @@ entry: ; DEFAULT: ldrb [[A:w.*]], [x{{.*}}] ; DEFAULT: ldrb [[B:w.*]], [x{{.*}}] -; ALWAYS-DAG: ldg [[PA:x.*]], [x{{.*}}] -; ALWAYS-DAG: ldrb [[B:w.*]], [sp] -; ALWAYS-DAG: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}} +; ALWAYS: ldg [[PA:x.*]], [x{{.*}}] +; ALWAYS: ldrb [[B:w.*]], [sp] +; ALWAYS: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}} +; COMMON: add w0, [[B]], [[A]] ; COMMON: ret ; One of these allocas is closer to FP than to SP, and within 256 bytes diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir index 43931132107aa..b0f9cc52ae144 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir 
@@ -53,3 +53,116 @@ body: | S_ENDPGM 0, implicit %9 ... + +--- +name: add_neg_inline_const_64_to_sub_s32_s +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX6-LABEL: name: add_neg_inline_const_64_to_sub_s32_s + ; GFX6: liveins: $sgpr0 + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY]], 64, implicit-def $scc + ; GFX6: S_ENDPGM 0, implicit [[S_SUB_I32_]] + ; GFX9-LABEL: name: add_neg_inline_const_64_to_sub_s32_s + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY]], 64, implicit-def $scc + ; GFX9: S_ENDPGM 0, implicit [[S_SUB_I32_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 -64 + %2:sgpr(s32) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: add_neg_inline_const_64_to_sub_s32_v +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: add_neg_inline_const_64_to_sub_s32_v + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967232, implicit $exec + ; GFX6: %2:vgpr_32, dead %3:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: S_ENDPGM 0, implicit %2 + ; GFX9-LABEL: name: add_neg_inline_const_64_to_sub_s32_v + ; GFX9: liveins: $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[COPY]], 64, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_SUB_U32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_CONSTANT i32 -64 + %2:vgpr(s32) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: add_neg_inline_const_16_to_sub_s32_s +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX6-LABEL: name: add_neg_inline_const_16_to_sub_s32_s + ; GFX6: liveins: $sgpr0 + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX6: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc + ; GFX6: S_ENDPGM 0, implicit [[S_ADD_U32_]] + ; GFX9-LABEL: name: add_neg_inline_const_16_to_sub_s32_s + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc + ; GFX9: S_ENDPGM 0, implicit [[S_ADD_U32_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 16 + %2:sgpr(s32) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 + +... 
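The preceding tests (and the one that follows) exercise one selection rule: on AMDGPU an immediate is free only in the inline range [-16, 64], so "add x, -64", whose constant is not inline but whose negation is, is better selected as "sub x, 64", while "add x, 16" keeps its add. A plain-C++ model of that legality check; the range is the documented integer inline-immediate range, everything else is illustrative:

  #include <cassert>
  #include <cstdint>

  // AMDGPU integer inline immediates cover [-16, 64].
  static bool isInlineImm(int32_t C) { return C >= -16 && C <= 64; }

  // Fold "add x, C" into "sub x, -C" only when that turns a non-inline
  // constant into an inline one.
  static bool shouldFoldAddToSub(int32_t C, int32_t &SubImm) {
    if (!isInlineImm(C) && isInlineImm(-C)) {
      SubImm = -C;
      return true;
    }
    return false;
  }

  int main() {
    int32_t Imm = 0;
    assert(shouldFoldAddToSub(-64, Imm) && Imm == 64); // S_SUB_I32 ..., 64
    assert(!shouldFoldAddToSub(16, Imm)); // 16 is already inline: keep add
    return 0;
  }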
+ +--- +name: add_neg_inline_const_16_to_sub_s32_v +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: add_neg_inline_const_16_to_sub_s32_v + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; GFX6: %2:vgpr_32, dead %3:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: S_ENDPGM 0, implicit %2 + ; GFX9-LABEL: name: add_neg_inline_const_16_to_sub_s32_v + ; GFX9: liveins: $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_CONSTANT i32 16 + %2:vgpr(s32) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir new file mode 100644 index 0000000000000..cc48e9126c9b7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir @@ -0,0 +1,132 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s + +# Note: 16-bit instructions generally produce a 0 result in the high 16-bits on GFX8 and GFX9 and preserve high 16 bits on GFX10+ + +--- +name: add_s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: add_s16 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] + ; GFX10-LABEL: name: add_s16 + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_ADD %2, %3 + S_ENDPGM 0, implicit %4 + +... 
+ +--- +name: add_s16_zext_to_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: add_s16_zext_to_s32 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] + ; GFX10-LABEL: name: add_s16_zext_to_s32 + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ADD_U16_e64_]], 0, 16, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_ADD %2, %3 + %5:vgpr(s32) = G_ZEXT %4 + S_ENDPGM 0, implicit %5 + +... + +--- +name: add_s16_neg_inline_const_64 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: add_s16_neg_inline_const_64 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] + ; GFX10-LABEL: name: add_s16_neg_inline_const_64 + ; GFX10: liveins: $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s16) = G_TRUNC %0 + %2:vgpr(s16) = G_CONSTANT i16 -64 + %3:vgpr(s16) = G_ADD %1, %2 + S_ENDPGM 0, implicit %3 + +... + +--- +name: add_s16_neg_inline_const_64_zext_to_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] + ; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32 + ; GFX10: liveins: $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec + ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_SUB_U16_e64_]], 0, 16, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s16) = G_TRUNC %0 + %2:vgpr(s16) = G_CONSTANT i16 -64 + %3:vgpr(s16) = G_ADD %1, %2 + %4:vgpr(s32) = G_ZEXT %3 + S_ENDPGM 0, implicit %4 + +... 
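The GFX10 checks above keep a V_BFE_U32 after the 16-bit op, while the ashr/lshr hunks that follow delete it for GFX8/GFX9; both fall out of the note at the top of this test file. A one-line predicate capturing the rule, with generation numbers as illustrative shorthand for the real subtarget query:

  // On GFX8/GFX9 a 16-bit VALU result is already zero-extended into the
  // 32-bit destination register, so a following zext selects to nothing.
  // GFX10+ preserves the high 16 bits instead, so an explicit
  // V_BFE_U32 dst, src, 0, 16 is still required.
  static bool zextOfOp16IsFree(unsigned Gen) { return Gen == 8 || Gen == 9; }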
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir index b3402a6051488..8d3d677b3c007 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir @@ -168,14 +168,12 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ASHRREV_I16_e64_]], 0, 16, implicit $exec - ; GFX8: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX8: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] ; GFX9-LABEL: name: ashr_s16_s16_vv_zext_to_s32 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ASHRREV_I16_e64_]], 0, 16, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX9: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] ; GFX10-LABEL: name: ashr_s16_s16_vv_zext_to_s32 ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir index 866ab39fe2d19..f28e35669357b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir @@ -168,14 +168,12 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHRREV_B16_e64_]], 0, 16, implicit $exec - ; GFX8: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX8: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] ; GFX9-LABEL: name: lshr_s16_s16_vv_zext_to_s32 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHRREV_B16_e64_]], 0, 16, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX9: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] ; GFX10-LABEL: name: lshr_s16_s16_vv_zext_to_s32 ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.mir new file mode 100644 index 0000000000000..b09abd4be1ad7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.mir @@ -0,0 +1,140 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s + +--- +name: smed3_s32_vvv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: smed3_s32_vvv + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MED3_I32_:%[0-9]+]]:vgpr_32 = V_MED3_I32 [[COPY]], [[COPY1]], [[COPY2]], implicit 
$exec + ; GFX6: S_ENDPGM 0, implicit [[V_MED3_I32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_SMAX %0, %1 + %4:vgpr(s32) = G_SMIN %0, %1 + %5:vgpr(s32) = G_SMAX %4, %2 + %6:vgpr(s32) = G_SMIN %3, %5 + S_ENDPGM 0, implicit %6 +... + +--- + +name: smed3_s32_sss +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + + ; GFX6-LABEL: name: smed3_s32_sss + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX6: [[S_MAX_I32_:%[0-9]+]]:sreg_32 = S_MAX_I32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX6: [[S_MIN_I32_:%[0-9]+]]:sreg_32 = S_MIN_I32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX6: [[S_MAX_I32_1:%[0-9]+]]:sreg_32 = S_MAX_I32 [[S_MIN_I32_]], [[COPY2]], implicit-def $scc + ; GFX6: [[S_MIN_I32_1:%[0-9]+]]:sreg_32 = S_MIN_I32 [[S_MAX_I32_]], [[S_MAX_I32_1]], implicit-def $scc + ; GFX6: S_ENDPGM 0, implicit [[S_MIN_I32_1]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_SMAX %0, %1 + %4:sgpr(s32) = G_SMIN %0, %1 + %5:sgpr(s32) = G_SMAX %4, %2 + %6:sgpr(s32) = G_SMIN %3, %5 + S_ENDPGM 0, implicit %6 +... + +--- +name: smed3_s32_vvv_multiuse0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: smed3_s32_vvv_multiuse0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_MIN_I32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[V_MAX_I32_e64_]], [[V_MAX_I32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_I32_e64_1]], implicit [[V_MAX_I32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_SMAX %0, %1 + %4:vgpr(s32) = G_SMIN %0, %1 + %5:vgpr(s32) = G_SMAX %4, %2 + %6:vgpr(s32) = G_SMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %3 +... + +--- +name: smed3_s32_vvv_multiuse1 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: smed3_s32_vvv_multiuse1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_MIN_I32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[V_MAX_I32_e64_]], [[V_MAX_I32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_I32_e64_1]], implicit [[V_MIN_I32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_SMAX %0, %1 + %4:vgpr(s32) = G_SMIN %0, %1 + %5:vgpr(s32) = G_SMAX %4, %2 + %6:vgpr(s32) = G_SMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %4 +... 
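
[reviewer note] Every smed3 pattern in this file matches the same min/max decomposition of a three-input median, med3(a, b, c) = min(max(a, b), max(min(a, b), c)); the umed3 files further down repeat the structure with unsigned min/max. A small C++ sanity check of the identity (a sketch, not selector code):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int32_t med3(int32_t a, int32_t b, int32_t c) {
      return std::min(std::max(a, b), std::max(std::min(a, b), c));
    }

    int main() {
      assert(med3(3, 1, 2) == 2);  // distinct values: picks the middle one
      assert(med3(-5, 7, 0) == 0);
      assert(med3(4, 4, 1) == 4);  // ties resolve to the repeated value
    }

Note that the sss variant keeps the scalar S_MIN/S_MAX sequence: V_MED3_I32 is a VALU instruction and there is no scalar med3, so the fold only applies on the vector register bank.
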
+ +--- +name: smed3_s32_vvv_multiuse2 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: smed3_s32_vvv_multiuse2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_MIN_I32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[V_MAX_I32_e64_]], [[V_MAX_I32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_I32_e64_1]], implicit [[V_MAX_I32_e64_1]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_SMAX %0, %1 + %4:vgpr(s32) = G_SMIN %0, %1 + %5:vgpr(s32) = G_SMAX %4, %2 + %6:vgpr(s32) = G_SMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %5 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir new file mode 100644 index 0000000000000..9e029ee5e066c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir @@ -0,0 +1,168 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s + +--- +name: smed3_s16_vvv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: smed3_s16_vvv + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]] + ; GFX9-LABEL: name: smed3_s16_vvv + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MED3_I16_:%[0-9]+]]:vgpr_32 = V_MED3_I16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MED3_I16_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_SMAX %3, %4 + %7:vgpr(s16) = G_SMIN %3, %4 + %8:vgpr(s16) = G_SMAX %7, %5 + %9:vgpr(s16) = G_SMIN %6, %8 + + S_ENDPGM 0, implicit %9 +... 
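
[reviewer note] The multiuse variants (here and in the s16 and umed3 files) pin down the other side of the fold: when an intermediate min/max result has a second user, that instruction has to stay, so rewriting the final min as med3 would add an instruction rather than replace three. A hypothetical shape of the guard, illustrative rather than the actual selector code:

    #include <cassert>

    // Hypothetical node type standing in for a machine SSA value.
    struct Node {
      int NumUses;
      bool hasOneUse() const { return NumUses == 1; }
    };

    // min(max(a,b), max(min(a,b),c)) -> med3(a,b,c) is only profitable if
    // all three intermediate results die at the final min.
    bool canFoldToMed3(const Node &Max0, const Node &Min0, const Node &Max1) {
      return Max0.hasOneUse() && Min0.hasOneUse() && Max1.hasOneUse();
    }

    int main() {
      Node Max0{2}, Min0{1}, Max1{1};           // Max0 has an extra user
      assert(!canFoldToMed3(Max0, Min0, Max1)); // matches multiuse0: no med3
    }
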
+ +--- +name: smed3_s16_vvv_multiuse0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: smed3_s16_vvv_multiuse0 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MAX_I16_e64_]] + ; GFX9-LABEL: name: smed3_s16_vvv_multiuse0 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MAX_I16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_SMAX %3, %4 + %7:vgpr(s16) = G_SMIN %3, %4 + %8:vgpr(s16) = G_SMAX %7, %5 + %9:vgpr(s16) = G_SMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %6 +... 
+ +--- +name: smed3_s16_vvv_multiuse1 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: smed3_s16_vvv_multiuse1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MIN_I16_e64_]] + ; GFX9-LABEL: name: smed3_s16_vvv_multiuse1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MIN_I16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_SMAX %3, %4 + %7:vgpr(s16) = G_SMIN %3, %4 + %8:vgpr(s16) = G_SMAX %7, %5 + %9:vgpr(s16) = G_SMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %7 +... 
+ +--- +name: smed3_s16_vvv_multiuse2 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: smed3_s16_vvv_multiuse2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MAX_I16_e64_1]] + ; GFX9-LABEL: name: smed3_s16_vvv_multiuse2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MAX_I16_e64_1]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_SMAX %3, %4 + %7:vgpr(s16) = G_SMIN %3, %4 + %8:vgpr(s16) = G_SMAX %7, %5 + %9:vgpr(s16) = G_SMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %8 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.mir new file mode 100644 index 0000000000000..a8341251faf64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.mir @@ -0,0 +1,140 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s + +--- +name: umed3_s32_vvv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: umed3_s32_vvv + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MED3_U32_:%[0-9]+]]:vgpr_32 = V_MED3_U32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MED3_U32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_UMAX %0, %1 + %4:vgpr(s32) = G_UMIN %0, %1 + %5:vgpr(s32) = G_UMAX %4, %2 + %6:vgpr(s32) = G_UMIN %3, %5 + S_ENDPGM 0, implicit %6 +... 
+ +--- + +name: umed3_s32_sss +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + + ; GFX6-LABEL: name: umed3_s32_sss + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX6: [[S_MAX_U32_:%[0-9]+]]:sreg_32 = S_MAX_U32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX6: [[S_MIN_U32_:%[0-9]+]]:sreg_32 = S_MIN_U32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX6: [[S_MAX_U32_1:%[0-9]+]]:sreg_32 = S_MAX_U32 [[S_MIN_U32_]], [[COPY2]], implicit-def $scc + ; GFX6: [[S_MIN_U32_1:%[0-9]+]]:sreg_32 = S_MIN_U32 [[S_MAX_U32_]], [[S_MAX_U32_1]], implicit-def $scc + ; GFX6: S_ENDPGM 0, implicit [[S_MIN_U32_1]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_UMAX %0, %1 + %4:sgpr(s32) = G_UMIN %0, %1 + %5:sgpr(s32) = G_UMAX %4, %2 + %6:sgpr(s32) = G_UMIN %3, %5 + S_ENDPGM 0, implicit %6 +... + +--- +name: umed3_s32_vvv_multiuse0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: umed3_s32_vvv_multiuse0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_U32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[V_MIN_U32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_MAX_U32_e64_]], [[V_MAX_U32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_U32_e64_1]], implicit [[V_MAX_U32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_UMAX %0, %1 + %4:vgpr(s32) = G_UMIN %0, %1 + %5:vgpr(s32) = G_UMAX %4, %2 + %6:vgpr(s32) = G_UMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %3 +... + +--- +name: umed3_s32_vvv_multiuse1 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: umed3_s32_vvv_multiuse1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_U32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[V_MIN_U32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_MAX_U32_e64_]], [[V_MAX_U32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_U32_e64_1]], implicit [[V_MIN_U32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_UMAX %0, %1 + %4:vgpr(s32) = G_UMIN %0, %1 + %5:vgpr(s32) = G_UMAX %4, %2 + %6:vgpr(s32) = G_UMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %4 +... 
+ +--- +name: umed3_s32_vvv_multiuse2 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: umed3_s32_vvv_multiuse2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_U32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[V_MIN_U32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_MAX_U32_e64_]], [[V_MAX_U32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_U32_e64_1]], implicit [[V_MAX_U32_e64_1]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_UMAX %0, %1 + %4:vgpr(s32) = G_UMIN %0, %1 + %5:vgpr(s32) = G_UMAX %4, %2 + %6:vgpr(s32) = G_UMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %5 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir new file mode 100644 index 0000000000000..c323883ff6139 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir @@ -0,0 +1,168 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s + +--- +name: umed3_s16_vvv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: umed3_s16_vvv + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]] + ; GFX9-LABEL: name: umed3_s16_vvv + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MED3_U16_:%[0-9]+]]:vgpr_32 = V_MED3_U16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MED3_U16_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_UMAX %3, %4 + %7:vgpr(s16) = G_UMIN %3, %4 + %8:vgpr(s16) = G_UMAX %7, %5 + %9:vgpr(s16) = G_UMIN %6, %8 + + S_ENDPGM 0, implicit %9 +... 
+ +--- +name: umed3_s16_vvv_multiuse0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: umed3_s16_vvv_multiuse0 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MAX_U16_e64_]] + ; GFX9-LABEL: name: umed3_s16_vvv_multiuse0 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MAX_U16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_UMAX %3, %4 + %7:vgpr(s16) = G_UMIN %3, %4 + %8:vgpr(s16) = G_UMAX %7, %5 + %9:vgpr(s16) = G_UMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %6 +... 
+ +--- +name: umed3_s16_vvv_multiuse1 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: umed3_s16_vvv_multiuse1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MIN_U16_e64_]] + ; GFX9-LABEL: name: umed3_s16_vvv_multiuse1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MIN_U16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_UMAX %3, %4 + %7:vgpr(s16) = G_UMIN %3, %4 + %8:vgpr(s16) = G_UMAX %7, %5 + %9:vgpr(s16) = G_UMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %7 +... 
+ +--- +name: umed3_s16_vvv_multiuse2 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: umed3_s16_vvv_multiuse2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MAX_U16_e64_1]] + ; GFX9-LABEL: name: umed3_s16_vvv_multiuse2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MAX_U16_e64_1]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_UMAX %3, %4 + %7:vgpr(s16) = G_UMIN %3, %4 + %8:vgpr(s16) = G_UMAX %7, %5 + %9:vgpr(s16) = G_UMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %8 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir index 4321e4e7ca214..3085bb7201513 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir @@ -168,14 +168,12 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHLREV_B16_e64_]], 0, 16, implicit $exec - ; GFX8: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX8: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] ; GFX9-LABEL: name: shl_s16_s16_vv_zext_to_s32 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHLREV_B16_e64_]], 0, 16, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX9: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] ; GFX10-LABEL: name: shl_s16_s16_vv_zext_to_s32 ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index dd4f892ebc231..ce71a89adacb7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -27,7 +27,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret 
void @@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -56,7 +56,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -78,7 +78,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; PACKED: bb.1 (%ir-block.0): @@ -91,7 +91,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], 
[[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,7 +116,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 (%ir-block.0): @@ -131,7 +131,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def 
$scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -211,7 +211,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -240,7 +240,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; PACKED: bb.1 (%ir-block.0): @@ -253,7 +253,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -275,7 +275,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: 
[[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; PACKED: bb.1 (%ir-block.0): @@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -310,7 +310,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; PACKED: bb.1 (%ir-block.0): @@ -323,7 +323,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], 
[[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -346,7 +346,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): @@ -359,7 +359,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -384,7 +384,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -399,7 +399,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], 
%subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
 ; PACKED: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4)
+ ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
 ; PACKED: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4096
 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -445,7 +445,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
 ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
 ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4)
+ ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
 ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -485,7 +485,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
 ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec
 ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4)
+ ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
 ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
index 75d25b0c2c469..aea37fd08b408 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
 ret void
@@ -51,7 +51,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -72,7 +72,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
- ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -94,7 +94,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -132,7 +132,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -159,7 +159,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
- ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0)
 ret void
@@ -179,7 +179,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0)
 ret void
@@ -199,7 +199,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 16, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 16, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 16
 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -220,7 +220,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4095, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4095
 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -243,7 +243,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
 ; CHECK: %16:vgpr_32, dead %17:sreg_64 = V_ADD_I32_e64 [[COPY6]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4096
 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -286,7 +286,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 4096, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
index 4db5fe081fda2..c5aa36df8675d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
@@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -71,7 +71,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -103,7 +103,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
 ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
 ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -148,7 +148,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
 ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
 ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
 ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1)
 ret void
@@ -191,7 +191,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
 ret void
@@ -209,7 +209,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3)
 ret void
@@ -227,7 +227,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4)
 ret void
@@ -245,7 +245,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6)
 ret void
@@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5)
 ret void
@@ -281,7 +281,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7)
 ret void
@@ -301,7 +301,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -322,7 +322,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
- ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -344,7 +344,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -362,7 +362,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4)
+ ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4)
 ; CHECK: S_ENDPGM 0
 %val.trunc = trunc i32 %val to i8
 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -381,7 +381,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 %val.trunc = trunc i32 %val to i16
 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -400,7 +400,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -418,7 +418,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
 ret void
@@ -474,7 +474,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v
 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
 ret void
@@ -516,7 +516,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v
 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0)
 ret void
@@ -534,7 +534,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 16
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -553,7 +553,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4095
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -574,7 +574,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
 ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4096
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -593,7 +593,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0)
 ret void
@@ -611,7 +611,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0)
 ret void
@@ -629,7 +629,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 16
 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -648,7 +648,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4095
 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -669,7 +669,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
 ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
 ; CHECK: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4096
 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -707,7 +707,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -750,7 +750,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr
 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
index 44a17012237ec..d4a3f4025b378 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
@@ -18,7 +18,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
 ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
- ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; CHECK: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
 ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1)
 ; CHECK: S_ENDPGM 0
@@ -44,7 +44,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
 ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; CHECK: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
 ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1)
 ; CHECK: S_ENDPGM 0
@@ -98,7 +98,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) {
 ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec
 ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc
 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -160,7 +160,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg
 ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec
 ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc
 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
index c59372a8d09c7..e5d67a3f88742 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
@@ -23,7 +23,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
- ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; CHECK: [[COPY13:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
 ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY13]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1)
 ; CHECK: S_ENDPGM 0
@@ -54,7 +54,7 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre
 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
 ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; CHECK: [[COPY14:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
 ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY14]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1)
 ; CHECK: S_ENDPGM 0
@@ -113,7 +113,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr
 ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec
 ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc
 ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -169,7 +169,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre
 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -247,7 +247,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr
 ; CHECK: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV5]](s64), [[UV5]](s64), implicit $exec
 ; CHECK: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc
 ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
index 33a8e9a1284cc..4443daba2ee2d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
@@ -14,7 +14,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: $vgpr0 = COPY [[INT]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -34,7 +34,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr
 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
 ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: $vgpr0 = COPY [[INT]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -72,7 +72,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -108,7 +108,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %9(s32), %bb.2
 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec
 ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec
- ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -156,7 +156,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
 ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec
 ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec
 ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
- ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
index 9bc81aecc8a1d..a657488278b04 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
@@ -15,7 +15,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vg
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: $vgpr0 = COPY [[INT]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -37,7 +37,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__sgpr_val__sgpr_vindex__sg
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
 ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
 ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: $vgpr0 = COPY [[INT]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -76,7 +76,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg
 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
- ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -113,7 +113,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp
 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2
 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec
 ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY6]](s32), implicit $exec
- ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -162,7 +162,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg
 ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec
 ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]](s32), implicit $exec
 ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
- ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4)
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll
index efe81eabc3497..a6ba559382f5c 100644
---
a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -16,7 +16,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -39,7 +39,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[COPY6]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -76,7 +76,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -111,7 +111,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: 
[[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY7]](s32), implicit $exec - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -158,7 +158,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll index f96a13878ba6a..ce62e041aa67e 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -10,218 +10,218 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 ; GCN: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM killed [[REG_SEQUENCE]], 0, 0, 0 :: (dereferenceable invariant load 16 from %ir.arg0, addrspace 6) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 16, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 16, align 1, addrspace 4) ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 32, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 
0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 48, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 64, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 80, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 96, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4) ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 ; GCN: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF1]].sub0 ; GCN: 
[[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]].sub0 ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7 + 112, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: 
[[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = 
BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], 
[[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0 ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 ; GCN: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub0 ; GCN: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 ; GCN: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF6]].sub0 ; GCN: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[DEF7]].sub0 ; GCN: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0 ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: 
[[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 ; GCN: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 ; GCN: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: 
[[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 ; GCN: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 ; GCN: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 ; GCN: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, 
addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[DEF9]].sub0 ; GCN: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 ; GCN: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF10:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[DEF10]].sub0 ; GCN: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 ; GCN: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF11:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: 
[[COPY37:%[0-9]+]]:vgpr_32 = COPY [[DEF11]].sub0 ; GCN: [[DEF12:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[DEF12]].sub0 ; GCN: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF13:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[DEF13]].sub0 ; GCN: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY42:%[0-9]+]]:vgpr_32 = COPY [[DEF14]].sub0 ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = 
S_MOV_B32 288 ; GCN: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 ; GCN: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 
0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 ; GCN: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: S_ENDPGM 0 bb.0: %tmp0 = load <4 x i32>, <4 x i32> addrspace(6)* %arg0, align 16, !invariant.load !0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll index 0c264251942a4..93322c7da4f86 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ 
b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -12,7 +12,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 @@ -21,7 +21,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; GCN: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: S_ENDPGM 0 main_body: %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index cdcf7383afc09..068e3d98f17be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -27,7 +27,7 @@ ; MIR-LABEL: name: gws_barrier_offset0{{$}} ; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { -; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom GWSResource) +; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom "GWSResource") ; MIR-NEXT: S_WAITCNT 0 ; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { diff --git a/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll b/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll index f70e9f378f3b0..e7e4d2bf26f2b 100644 --- a/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll +++ b/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll @@ -43,4 +43,4 @@ land.end: ; preds = %land.rhs, %entry ret void } -attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m4" "target-features"="+armv7e-m,+dsp,+fp16,+fpregs,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp,-aes,-crc,-crypto,-dotprod,-fp16fml,-fullfp16,-hwdiv-arm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 
= { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m4" "target-features"="+armv7e-m,+dsp,+fp16,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp,-aes,-crc,-crypto,-dotprod,-fp16fml,-fullfp16,-hwdiv-arm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll index 1d85f4f9680ae..14bace2f95f8b 100644 --- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll +++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll @@ -5,19 +5,16 @@ define <4 x double> @fneg_fdiv_splat(double %a0, <4 x double> %a1) { ; CHECK-LABEL: fneg_fdiv_splat: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha ; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxspltd 0, 1, 0 -; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha ; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l ; CHECK-NEXT: lxvd2x 1, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha ; CHECK-NEXT: xvredp 2, 0 -; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l -; CHECK-NEXT: xxswapd 1, 1 -; CHECK-NEXT: xvnmsubadp 1, 0, 2 -; CHECK-NEXT: xvmaddadp 2, 2, 1 -; CHECK-NEXT: lxvd2x 1, 0, 3 ; CHECK-NEXT: xxswapd 1, 1 +; CHECK-NEXT: xxlor 3, 1, 1 +; CHECK-NEXT: xvmaddadp 3, 0, 2 +; CHECK-NEXT: xvnmsubadp 2, 2, 3 ; CHECK-NEXT: xvmaddadp 1, 0, 2 ; CHECK-NEXT: xvmsubadp 2, 2, 1 ; CHECK-NEXT: xvmuldp 34, 34, 2 diff --git a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll index 5baf663481d70..88da295201fea 100644 --- a/llvm/test/CodeGen/PowerPC/fma-combine.ll +++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll @@ -8,14 +8,12 @@ define double @fma_combine1(double %a, double %b, double %c) { ; CHECK-FAST-LABEL: fma_combine1: ; CHECK-FAST: # %bb.0: # %entry -; CHECK-FAST-NEXT: xsnegdp 0, 3 -; CHECK-FAST-NEXT: xsmsubadp 1, 0, 2 +; CHECK-FAST-NEXT: xsnmaddadp 1, 3, 2 ; CHECK-FAST-NEXT: blr ; ; CHECK-FAST-NOVSX-LABEL: fma_combine1: ; CHECK-FAST-NOVSX: # %bb.0: # %entry -; CHECK-FAST-NOVSX-NEXT: fneg 0, 3 -; CHECK-FAST-NOVSX-NEXT: fmsub 1, 0, 2, 1 +; CHECK-FAST-NOVSX-NEXT: fnmadd 1, 3, 2, 1 ; CHECK-FAST-NOVSX-NEXT: blr ; ; CHECK-LABEL: fma_combine1: @@ -34,14 +32,12 @@ entry: define double @fma_combine2(double %a, double %b, double %c) { ; CHECK-FAST-LABEL: fma_combine2: ; CHECK-FAST: # %bb.0: # %entry -; CHECK-FAST-NEXT: xsnegdp 0, 3 -; CHECK-FAST-NEXT: xsmsubadp 1, 2, 0 +; CHECK-FAST-NEXT: xsnmaddadp 1, 2, 3 ; CHECK-FAST-NEXT: blr ; ; CHECK-FAST-NOVSX-LABEL: fma_combine2: ; CHECK-FAST-NOVSX: # %bb.0: # %entry -; CHECK-FAST-NOVSX-NEXT: fneg 0, 3 -; CHECK-FAST-NOVSX-NEXT: fmsub 1, 2, 0, 1 +; CHECK-FAST-NOVSX-NEXT: fnmadd 1, 2, 3, 1 ; CHECK-FAST-NOVSX-NEXT: blr ; ; CHECK-LABEL: fma_combine2: @@ -62,25 +58,25 @@ entry: define double @fma_combine_two_uses(double %a, double %b, double %c) { ; CHECK-FAST-LABEL: fma_combine_two_uses: ; CHECK-FAST: # %bb.0: # %entry -; CHECK-FAST-NEXT: xsnegdp 0, 3 +; CHECK-FAST-NEXT: xsnegdp 0, 1 ; CHECK-FAST-NEXT: addis 3, 2, v@toc@ha ; CHECK-FAST-NEXT: addis 4, 2, z@toc@ha -; CHECK-FAST-NEXT: xsnegdp 3, 1 -; CHECK-FAST-NEXT: xsmsubadp 1, 0, 2 -; CHECK-FAST-NEXT: stfd 0, z@toc@l(4) -; CHECK-FAST-NEXT: stfd 3, v@toc@l(3) +; CHECK-FAST-NEXT: xsnmaddadp 1, 3, 2 
+; CHECK-FAST-NEXT: xsnegdp 2, 3 +; CHECK-FAST-NEXT: stfd 0, v@toc@l(3) +; CHECK-FAST-NEXT: stfd 2, z@toc@l(4) ; CHECK-FAST-NEXT: blr ; ; CHECK-FAST-NOVSX-LABEL: fma_combine_two_uses: ; CHECK-FAST-NOVSX: # %bb.0: # %entry -; CHECK-FAST-NOVSX-NEXT: fneg 3, 3 +; CHECK-FAST-NOVSX-NEXT: fnmadd 0, 3, 2, 1 +; CHECK-FAST-NOVSX-NEXT: fneg 2, 1 ; CHECK-FAST-NOVSX-NEXT: addis 3, 2, v@toc@ha ; CHECK-FAST-NOVSX-NEXT: addis 4, 2, z@toc@ha -; CHECK-FAST-NOVSX-NEXT: fmsub 0, 3, 2, 1 -; CHECK-FAST-NOVSX-NEXT: fneg 2, 1 -; CHECK-FAST-NOVSX-NEXT: stfd 3, z@toc@l(4) +; CHECK-FAST-NOVSX-NEXT: fneg 3, 3 ; CHECK-FAST-NOVSX-NEXT: fmr 1, 0 ; CHECK-FAST-NOVSX-NEXT: stfd 2, v@toc@l(3) +; CHECK-FAST-NOVSX-NEXT: stfd 3, z@toc@l(4) ; CHECK-FAST-NOVSX-NEXT: blr ; ; CHECK-LABEL: fma_combine_two_uses: @@ -108,19 +104,17 @@ entry: define double @fma_combine_one_use(double %a, double %b, double %c) { ; CHECK-FAST-LABEL: fma_combine_one_use: ; CHECK-FAST: # %bb.0: # %entry -; CHECK-FAST-NEXT: xsnegdp 0, 3 +; CHECK-FAST-NEXT: xsnegdp 0, 1 ; CHECK-FAST-NEXT: addis 3, 2, v@toc@ha -; CHECK-FAST-NEXT: xsnegdp 3, 1 -; CHECK-FAST-NEXT: xsmsubadp 1, 0, 2 -; CHECK-FAST-NEXT: stfd 3, v@toc@l(3) +; CHECK-FAST-NEXT: xsnmaddadp 1, 3, 2 +; CHECK-FAST-NEXT: stfd 0, v@toc@l(3) ; CHECK-FAST-NEXT: blr ; ; CHECK-FAST-NOVSX-LABEL: fma_combine_one_use: ; CHECK-FAST-NOVSX: # %bb.0: # %entry -; CHECK-FAST-NOVSX-NEXT: fneg 0, 3 -; CHECK-FAST-NOVSX-NEXT: addis 3, 2, v@toc@ha -; CHECK-FAST-NOVSX-NEXT: fmsub 0, 0, 2, 1 +; CHECK-FAST-NOVSX-NEXT: fnmadd 0, 3, 2, 1 ; CHECK-FAST-NOVSX-NEXT: fneg 2, 1 +; CHECK-FAST-NOVSX-NEXT: addis 3, 2, v@toc@ha ; CHECK-FAST-NOVSX-NEXT: fmr 1, 0 ; CHECK-FAST-NOVSX-NEXT: stfd 2, v@toc@l(3) ; CHECK-FAST-NOVSX-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir b/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir index f2e576ed73b63..410f688204c31 100644 --- a/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir +++ b/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir @@ -118,7 +118,7 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 27, 5, 10 - ; CHECK: %2:gprc = RLWINM %1, 27, 5, 10 + ; CHECK-NOT: RLWINM %1, %3:gprc = RLWINM %2:gprc, 8, 5, 10 ; CHECK: %3:gprc = LI 0 BLR8 implicit $lr8, implicit $rm @@ -133,9 +133,24 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 27, 5, 10 - ; CHECK: %2:gprc = RLWINM %1, 27, 5, 10 + ; CHECK-NOT: RLWINM %1, %3:gprc = RLWINM_rec %2:gprc, 8, 5, 10, implicit-def $cr0 - ; CHECK: %3:gprc = ANDI_rec %2, 0, implicit-def $cr0 + ; CHECK: %3:gprc = ANDI_rec %1, 0, implicit-def $cr0 BLR8 implicit $lr8, implicit $rm +... +--- +name: testFoldRLWINMoToZeroSrcCanNotBeDeleted +# CHECK: name: testFoldRLWINMoToZeroSrcCanNotBeDeleted +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM_rec %1:gprc, 27, 5, 10, implicit-def $cr0 + ; CHECK: %2:gprc = RLWINM_rec %1, 27, 5, 10, implicit-def $cr0 + %3:gprc = RLWINM_rec %2:gprc, 8, 5, 10, implicit-def $cr0 + ; CHECK: %3:gprc = ANDI_rec %1, 0, implicit-def $cr0 BLR8 implicit $lr8, implicit $rm ...
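+# Unlike the plain RLWINM case above, the source RLWINM_rec cannot be deleted
+# here: it also defines $cr0, which may still be live, so only the second
+# rotate is expected to fold into ANDI_rec %1.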
--- diff --git a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll index 3bfd92a2e5b36..246bec1918ef2 100644 --- a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll +++ b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll @@ -229,8 +229,8 @@ define <4 x double> @foo2_fmf(<4 x double> %a, <4 x double> %b) nounwind { ; CHECK-NEXT: qvfre 3, 2 ; CHECK-NEXT: addi 3, 3, .LCPI8_0@toc@l ; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfnmsub 0, 2, 3, 0 -; CHECK-NEXT: qvfmadd 0, 3, 0, 3 +; CHECK-NEXT: qvfmadd 0, 2, 3, 0 +; CHECK-NEXT: qvfnmsub 0, 3, 0, 3 ; CHECK-NEXT: qvfmul 3, 1, 0 ; CHECK-NEXT: qvfnmsub 1, 2, 3, 1 ; CHECK-NEXT: qvfmadd 1, 0, 1, 3 diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll index bc33617662e47..a0afb4b6e12dc 100644 --- a/llvm/test/CodeGen/PowerPC/recipest.ll +++ b/llvm/test/CodeGen/PowerPC/recipest.ll @@ -194,8 +194,8 @@ define <4 x float> @hoo_safe(<4 x float> %a, <4 x float> %b) nounwind { define double @foo2_fmf(double %a, double %b) nounwind { ; CHECK: @foo2_fmf ; CHECK-DAG: fre -; CHECK-DAG: fnmsub -; CHECK: fmadd +; CHECK-DAG: fmadd +; CHECK: fnmsub ; CHECK-NEXT: fmul ; CHECK-NEXT: fnmsub ; CHECK-NEXT: fmadd diff --git a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll index 9ab320cd1eacf..2cdf832838a8d 100644 --- a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll +++ b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll @@ -13,9 +13,9 @@ define <4 x float> @repeated_fp_divisor(float %a, <4 x float> %b) { ; CHECK-NEXT: lvx 4, 0, 3 ; CHECK-NEXT: xxspltw 0, 0, 0 ; CHECK-NEXT: xvresp 1, 0 -; CHECK-NEXT: xvnmsubasp 35, 0, 1 +; CHECK-NEXT: xvmaddasp 35, 0, 1 ; CHECK-NEXT: xvmulsp 0, 34, 36 -; CHECK-NEXT: xvmaddasp 1, 1, 35 +; CHECK-NEXT: xvnmsubasp 1, 1, 35 ; CHECK-NEXT: xvmulsp 34, 0, 1 ; CHECK-NEXT: blr %ins = insertelement <4 x float> undef, float %a, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir index 976c5f5d7ba36..6dd8caafc33e7 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir @@ -60,7 +60,7 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 declare void @llvm.stackprotector(i8*, i8**) #5 - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" 
"target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { noduplicate nounwind } attributes #2 = { nounwind readnone } attributes #3 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir index 9e429040db4fd..d49a1e86109b8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir @@ -62,7 +62,7 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 declare void @llvm.stackprotector(i8*, i8**) #5 - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { noduplicate nounwind } attributes #2 = { nounwind readnone } attributes #3 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir index ab7fcf843d7dc..bf1c40fb34e5f 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir @@ -62,7 +62,7 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 declare void @llvm.stackprotector(i8*, i8**) #5 - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" 
"no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { noduplicate nounwind } attributes #2 = { nounwind readnone } attributes #3 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index ddf51b785ff0d..e25e0298eb243 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16 -disable-mve-tail-predication=false %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 -disable-mve-tail-predication=false %s -o - | FileCheck %s define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocapture readonly %b, float* nocapture readonly %c, i32 %N) { ; CHECK-LABEL: fast_float_mul: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index ebb041d937224..b152191798bcf 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16 %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 %s -o - | FileCheck %s define arm_aapcs_vfpcc void @float_float_mul(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: float_float_mul: diff --git a/llvm/test/CodeGen/VE/lit.local.cfg b/llvm/test/CodeGen/VE/lit.local.cfg new file mode 100644 index 0000000000000..b6366779272df --- /dev/null +++ b/llvm/test/CodeGen/VE/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'VE' in config.root.targets: + config.unsupported = True diff --git a/llvm/test/CodeGen/VE/target_support.ll b/llvm/test/CodeGen/VE/target_support.ll new file mode 100644 index 0000000000000..336d9cd367208 --- /dev/null +++ b/llvm/test/CodeGen/VE/target_support.ll @@ -0,0 +1,2 @@ +; RUN: llc --version | FileCheck %s +; CHECK: ve - VE diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index a5fd84c32ed51..e4a5d1392c0bf 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -462,14 +462,12 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX512BW-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 ; AVX512BW-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpavgb 32(%rsi), %xmm2, %xmm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, (%rax) -; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX512BW-NEXT: 
vmovdqu %xmm1, (%rax) +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: vmovdqu %xmm2, (%rax) ; AVX512BW-NEXT: retq %1 = load <48 x i8>, <48 x i8>* %a %2 = load <48 x i8>, <48 x i8>* %b diff --git a/llvm/test/CodeGen/X86/copy-eflags-liveinlists.mir b/llvm/test/CodeGen/X86/copy-eflags-liveinlists.mir new file mode 100644 index 0000000000000..54454fe0017f1 --- /dev/null +++ b/llvm/test/CodeGen/X86/copy-eflags-liveinlists.mir @@ -0,0 +1,92 @@ +# RUN: llc -mtriple=i686-unknown-unknown -run-pass=x86-flags-copy-lowering \ +# RUN: -print-after=x86-flags-copy-lowering %s -o - | FileCheck %s +# +# Check that $eflags is removed from live-in lists of successor blocks. +# +# CHECK-NOT: liveins: $eflags + +--- | + define void @fun(i16 %arg, i64 %arg1, i8 %arg2, i8* %arg3, i32 %arg4) { ret void} +... +--- +name: fun +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr16 } + - { id: 1, class: gr16 } + - { id: 2, class: gr16 } + - { id: 3, class: gr32 } + - { id: 4, class: gr32 } + - { id: 5, class: gr8 } + - { id: 6, class: gr32 } + - { id: 7, class: gr32 } + - { id: 8, class: gr32 } + - { id: 9, class: gr32 } + - { id: 10, class: gr32 } + - { id: 11, class: gr32 } + - { id: 12, class: gr32 } + - { id: 13, class: gr8 } + - { id: 14, class: gr32 } + - { id: 15, class: gr32 } + - { id: 16, class: gr32_abcd } + - { id: 17, class: gr8 } + - { id: 18, class: gr8 } + - { id: 19, class: gr32 } + - { id: 20, class: gr32 } +frameInfo: + maxAlignment: 4 +fixedStack: + - { id: 0, offset: 20, size: 4, alignment: 4, isImmutable: true } + - { id: 1, offset: 16, size: 4, alignment: 4, isImmutable: true } + - { id: 2, offset: 12, size: 1, alignment: 4, isImmutable: true } + - { id: 3, offset: 8, size: 4, alignment: 4, isImmutable: true } + - { id: 4, offset: 4, size: 4, alignment: 4, isImmutable: true } + - { id: 5, size: 2, alignment: 4, isImmutable: true } +machineFunctionInfo: {} +body: | + bb.0: + %4:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.3) + %3:gr32 = MOV32rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.4) + %7:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0) + %6:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.1) + %5:gr8 = MOV8rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load 1 from %fixed-stack.2, align 4) + %9:gr32 = IMPLICIT_DEF + %11:gr32 = IMPLICIT_DEF + + bb.1: + successors: %bb.2, %bb.3 + + CMP32rr %3, %9, implicit-def $eflags + %10:gr32 = SBB32rr %4, %11, implicit-def $eflags, implicit $eflags + %12:gr32 = COPY $eflags + %13:gr8 = SETCCr 12, implicit $eflags + %14:gr32 = MOVZX32rr8 killed %13 + %15:gr32 = NEG32r %14, implicit-def dead $eflags + %16:gr32_abcd = MOV32r0 implicit-def dead $eflags + $eflags = COPY %12 + %17:gr8 = COPY %16.sub_8bit + JCC_1 %bb.3, 12, implicit $eflags + + bb.2: + liveins: $eflags + + + bb.3: + successors: %bb.4, %bb.5 + liveins: $eflags + + %18:gr8 = PHI %5, %bb.2, %17, %bb.1 + MOV8mr %6, 1, $noreg, 0, $noreg, killed %18 :: (volatile store 1 into %ir.arg3) + JCC_1 %bb.5, 12, implicit $eflags + + bb.4: + + bb.5: + %19:gr32 = PHI %16, %bb.4, %15, %bb.3 + $eax = COPY %7 + CDQ implicit-def $eax, implicit-def $edx, implicit $eax + IDIV32r killed %19, implicit-def dead $eax, implicit-def $edx, implicit-def dead $eflags, implicit $eax, implicit $edx + JMP_1 %bb.1 + +... 
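+# bb.2 and bb.3 start out with $eflags in their live-in lists; after
+# x86-flags-copy-lowering rewrites the SBB32rr/SETCCr/JCC_1 users of the
+# copied flags, they should not, which is what the CHECK-NOT at the top
+# of the file verifies.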
diff --git a/llvm/test/CodeGen/X86/pr34657.ll b/llvm/test/CodeGen/X86/pr34657.ll index d8b72920fed15..9761927dc239b 100644 --- a/llvm/test/CodeGen/X86/pr34657.ll +++ b/llvm/test/CodeGen/X86/pr34657.ll @@ -5,13 +5,12 @@ define <112 x i8> @pr34657(<112 x i8>* %src) local_unnamed_addr { ; CHECK-LABEL: pr34657: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: vmovups 64(%rsi), %ymm0 -; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm1 -; CHECK-NEXT: vmovups (%rsi), %zmm2 -; CHECK-NEXT: vmovaps %ymm0, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm2, (%rdi) -; CHECK-NEXT: vextractf32x4 $2, %zmm1, 96(%rdi) +; CHECK-NEXT: vmovups (%rsi), %zmm0 +; CHECK-NEXT: vmovups 64(%rsi), %ymm1 +; CHECK-NEXT: vmovups 96(%rsi), %xmm2 +; CHECK-NEXT: vmovaps %xmm2, 96(%rdi) +; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index fcdebfa68a5e7..74a83214bf208 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1055,64 +1055,24 @@ ret void } define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) { -; AVX1-LABEL: interleaved_store_vf16_i8_stride3: -; AVX1: # %bb.0: -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm1, (%rdi) -; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: interleaved_store_vf16_i8_stride3: -; AVX2: # %bb.0: -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqu %xmm0, 16(%rdi) -; AVX2-NEXT: vmovdqu %xmm1, (%rdi) -; AVX2-NEXT: vmovdqu %xmm2, 32(%rdi) -; 
AVX2-NEXT: retq -; -; AVX512-LABEL: interleaved_store_vf16_i8_stride3: -; AVX512: # %bb.0: -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu %ymm0, (%rdi) -; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: interleaved_store_vf16_i8_stride3: +; AVX: # %bb.0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmovdqu %xmm0, 16(%rdi) +; AVX-NEXT: vmovdqu %xmm1, (%rdi) +; AVX-NEXT: vmovdqu %xmm2, 32(%rdi) +; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> diff --git a/llvm/test/MachineVerifier/live-ins-01.mir b/llvm/test/MachineVerifier/live-ins-01.mir new file mode 100644 index 0000000000000..51c05dacf0558 --- /dev/null +++ b/llvm/test/MachineVerifier/live-ins-01.mir @@ -0,0 +1,57 @@ +# RUN: not llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z14 -run-pass none 2>&1 | FileCheck %s +# REQUIRES: systemz-registered-target + +# Test that the machine verifier reports an error when a register in +# liveins is not live out from the predecessor. + +--- +name: f1 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $r2l, $r3l + + %1:gr32bit = COPY $r3l + %0:gr32bit = COPY $r2l + CHIMux %0, 0, implicit-def $cc + + bb.1: + liveins: $cc + + bb.2: + liveins: $cc + + %2:grx32bit = LOCRMux %1, %0, 14, 8, implicit $cc + $r2l = COPY %2 + Return implicit $r2l +... + +# CHECK: *** Bad machine code: Live in register not found to be live out from predecessor.
*** +# CHECK:- function: f2 +# CHECK:- basic block: %bb.2 +# CHECK:CC not found to be live out from %bb.1 --- +name: f2 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $r2l, $r3l + + %1:gr32bit = COPY $r3l + %0:gr32bit = COPY $r2l + CHIMux %0, 0, implicit-def $cc + + bb.1: + liveins: $cc + KILL killed $cc + + bb.2: + liveins: $cc + + %2:grx32bit = LOCRMux %1, %0, 14, 8, implicit $cc + $r2l = COPY %2 + Return implicit $r2l + +... diff --git a/llvm/test/MachineVerifier/live-ins-02.mir b/llvm/test/MachineVerifier/live-ins-02.mir new file mode 100644 index 0000000000000..d76325cdd1082 --- /dev/null +++ b/llvm/test/MachineVerifier/live-ins-02.mir @@ -0,0 +1,32 @@ +# RUN: not llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z14 -run-pass none 2>&1 | FileCheck %s +# REQUIRES: systemz-registered-target + +# Test that the machine verifier reports an error when a register in +# liveins is not live out from the predecessor. + +--- +name: f1 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $r2l, $r3l + + %1:gr32bit = COPY $r3l + %0:gr32bit = COPY $r2l + CHIMux %0, 0, implicit-def $cc + + bb.1: + + bb.2: + liveins: $cc + + %2:grx32bit = LOCRMux %1, %0, 14, 8, implicit $cc + $r2l = COPY %2 + Return implicit $r2l +... + +# CHECK: *** Bad machine code: Live in register not found to be live out from predecessor. *** +# CHECK:- function: f1 +# CHECK:- basic block: %bb.2 +# CHECK:CC not found to be live out from %bb.1 diff --git a/llvm/test/MachineVerifier/live-ins-03.mir b/llvm/test/MachineVerifier/live-ins-03.mir new file mode 100644 index 0000000000000..b5345ccdc3b63 --- /dev/null +++ b/llvm/test/MachineVerifier/live-ins-03.mir @@ -0,0 +1,36 @@ +# RUN: not llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z14 -run-pass none 2>&1 | FileCheck %s +# REQUIRES: systemz-registered-target + +# Test that the machine verifier reports an error when a register in +# liveins is not live out from the predecessor. + +--- +name: f1 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $r2l, $r3l + + %1:gr32bit = COPY $r3l + %0:gr32bit = COPY $r2l + CHIMux %0, 0, implicit-def $cc + + bb.1: + liveins: $cc + BRC 14, 8, %bb.3, implicit $cc + + bb.2: + + bb.3: + liveins: $cc + + %2:grx32bit = LOCRMux %1, %0, 14, 8, implicit $cc + $r2l = COPY %2 + Return implicit $r2l +... + +# CHECK: *** Bad machine code: Live in register not found to be live out from predecessor.
*** +# CHECK:- function: f1 +# CHECK:- basic block: %bb.3 +# CHECK:CC not found to be live out from %bb.2 diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter.td index 41c825950ca2f..59816497cc937 100644 --- a/llvm/test/TableGen/GlobalISelEmitter.td +++ b/llvm/test/TableGen/GlobalISelEmitter.td @@ -210,7 +210,7 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; } // CHECK-NEXT: }; // CHECK-NEXT: MyTargetInstructionSelector::CustomRendererFn // CHECK-NEXT: MyTargetInstructionSelector::CustomRenderers[] = { -// CHECK-NEXT: nullptr, // GICP_Invalid +// CHECK-NEXT: nullptr, // GICR_Invalid // CHECK-NEXT: &MyTargetInstructionSelector::renderImm8, // gi_cimm8 // CHECK-NEXT: }; diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/crash.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/crash.ll index 87a3ba5811e16..db3db632e5f24 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/crash.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/crash.ll @@ -11,10 +11,6 @@ define void @zot() personality i32 (...)* @wibble { ; ATTRIBUTOR-NEXT: bb: ; ATTRIBUTOR-NEXT: call void @hoge() ; ATTRIBUTOR-NEXT: unreachable -; ATTRIBUTOR: bb.split: -; ATTRIBUTOR-NEXT: unreachable -; ATTRIBUTOR: bb1.i2c: -; ATTRIBUTOR-NEXT: unreachable ; ATTRIBUTOR: bb1: ; ATTRIBUTOR-NEXT: unreachable ; ATTRIBUTOR: bb2: @@ -47,8 +43,6 @@ define internal void @hoge() { ; ATTRIBUTOR-LABEL: define {{[^@]+}}@hoge() ; ATTRIBUTOR-NEXT: bb: ; ATTRIBUTOR-NEXT: unreachable -; ATTRIBUTOR: bb.split: -; ATTRIBUTOR-NEXT: unreachable ; bb: %tmp = call fastcc i8* @spam(i1 (i8*)* @eggs) @@ -77,8 +71,6 @@ define i32 @test_inf_promote_caller(i32 %arg) { ; CHECK-SAME: (i32 [[ARG:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: unreachable -; CHECK: bb.split: -; CHECK-NEXT: unreachable ; bb: %tmp = alloca %S diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll index 36adfe08a4d20..153ce6893ba2a 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll @@ -17,8 +17,6 @@ define void @run() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @CaptureAStruct(%struct.Foo* nofree nonnull readonly align 8 dereferenceable(16) @a) ; CHECK-NEXT: unreachable -; CHECK: entry.split: -; CHECK-NEXT: unreachable ; entry: tail call i8 @UseLongDoubleUnsafely(%union.u* byval align 16 bitcast (%struct.s* @b to %union.u*)) diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll index b81d35491f944..a5ca51e9bd996 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes ; RUN: opt -S -basicaa -attributor -attributor-disable=false -attributor-max-iterations-verify -attributor-max-iterations=3 < %s | FileCheck %s --check-prefixes=CHECK,OLDPM_MODULE ; RUN: opt -S -passes='attributor' -aa-pipeline='basic-aa' -attributor-disable=false -attributor-max-iterations-verify -attributor-max-iterations=3 < %s | FileCheck %s --check-prefixes=CHECK,NEWPM_MODULE diff --git 
a/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll index 271854a224569..d08969c0a2620 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll @@ -13,8 +13,6 @@ define i32 @bar() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = call addrspace(1) i32 @foo() ; CHECK-NEXT: unreachable -; CHECK: entry.split: -; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/recursion.ll b/llvm/test/Transforms/Attributor/IPConstantProp/recursion.ll index b9fd0468d380a..fc82342a989f6 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/recursion.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/recursion.ll @@ -12,8 +12,6 @@ define internal i32 @foo(i32 %X) { define void @bar() { ; CHECK-LABEL: define {{[^@]+}}@bar() ; CHECK-NEXT: unreachable -; CHECK: .split: -; CHECK-NEXT: unreachable ; call i32 @foo( i32 17 ) ; :1 [#uses=0] ret void } diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll b/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll index 04927726daa25..f30461c746af8 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll @@ -8,8 +8,6 @@ define i1 @invokecaller(i1 %C) personality i32 (...)* @__gxx_personality_v0 { ; CHECK-SAME: (i1 [[C:%.*]]) #0 personality i32 (...)* @__gxx_personality_v0 ; CHECK-NEXT: [[X:%.*]] = call i32 @foo(i1 [[C]]) ; CHECK-NEXT: br label [[OK:%.*]] -; CHECK: .i2c: -; CHECK-NEXT: unreachable ; CHECK: OK: ; CHECK-NEXT: [[Y:%.*]] = icmp ne i32 52, 0 ; CHECK-NEXT: ret i1 [[Y]] diff --git a/llvm/test/Transforms/Attributor/liveness.ll b/llvm/test/Transforms/Attributor/liveness.ll index cf9dc8b789dd3..4dd37865fcc09 100644 --- a/llvm/test/Transforms/Attributor/liveness.ll +++ b/llvm/test/Transforms/Attributor/liveness.ll @@ -277,6 +277,96 @@ cleanup: ret i32 0 } +; UTC_ARGS: --turn on + +; TEST 5.4 nounwind invoke instruction replaced by a call and a branch instruction put after it.
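+; The invoke of @foo_nounwind below cannot unwind, so it is expected to be
+; rewritten into a plain call plus a branch to %continue, leaving the
+; %cleanup landing pad unreachable (see the CHECK lines).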
+define i32 @invoke_nounwind_phi(i32 %a) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: define {{[^@]+}}@invoke_nounwind_phi +; CHECK-SAME: (i32 [[A:%.*]]) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: call void @normal_call() +; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo_nounwind() +; CHECK-NEXT: br label [[CONTINUE:%.*]] +; CHECK: cond.false: +; CHECK-NEXT: call void @normal_call() +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar() +; CHECK-NEXT: br label [[CONTINUE]] +; CHECK: continue: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ 0, [[COND_TRUE]] ], [ 1, [[COND_FALSE]] ] +; CHECK-NEXT: ret i32 [[P]] +; CHECK: cleanup: +; CHECK-NEXT: unreachable +; +entry: + %cmp = icmp eq i32 %a, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + call void @normal_call() + %call = invoke i32 @foo_nounwind() to label %continue + unwind label %cleanup + +cond.false: ; preds = %entry + call void @normal_call() + %call1 = call i32 @bar() + br label %continue + +continue: + %p = phi i32 [ 0, %cond.true ], [ 1, %cond.false ] + ret i32 %p + +cleanup: + %res = landingpad { i8*, i32 } catch i8* null + ret i32 0 +} + +; TEST 5.5 nounwind invoke instruction replaced by a call and a branch instruction put after it. +define i32 @invoke_nounwind_phi_dom(i32 %a) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: define {{[^@]+}}@invoke_nounwind_phi_dom +; CHECK-SAME: (i32 [[A:%.*]]) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: call void @normal_call() +; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo_nounwind() +; CHECK-NEXT: br label [[CONTINUE:%.*]] +; CHECK: cond.false: +; CHECK-NEXT: call void @normal_call() +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar() +; CHECK-NEXT: br label [[CONTINUE]] +; CHECK: continue: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[CALL]], [[COND_TRUE]] ], [ [[CALL1]], [[COND_FALSE]] ] +; CHECK-NEXT: ret i32 [[P]] +; CHECK: cleanup: +; CHECK-NEXT: unreachable +; +entry: + %cmp = icmp eq i32 %a, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + call void @normal_call() + %call = invoke i32 @foo_nounwind() to label %continue + unwind label %cleanup + +cond.false: ; preds = %entry + call void @normal_call() + %call1 = call i32 @bar() + br label %continue + +continue: + %p = phi i32 [ %call, %cond.true ], [ %call1, %cond.false ] + ret i32 %p + +cleanup: + %res = landingpad { i8*, i32 } catch i8* null + ret i32 0 +} + ; UTC_ARGS: --turn off ; TEST 6: Undefined behavior, taken from LangRef.
@@ -707,7 +797,6 @@ define internal void @dead_e2() { ret void } ; CHECK-NEXT: define internal void @non_dead_d15() ; CHECK-NOT: define internal void @dead_e - declare void @blowup() noreturn define void @live_with_dead_entry() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { ; CHECK: define void @live_with_dead_entry( @@ -735,19 +824,19 @@ define void @live_with_dead_entry_lp() personality i8* bitcast (i32 (...)* @__gx ; CHECK: define void @live_with_dead_entry_lp( ; CHECK-NEXT: entry: ; CHECK-NEXT: invoke void @blowup() -; CHECK-NEXT: to label %live_with_dead_entry.dead unwind label %lp1 -; CHECK: lp1: ; preds = %entry +; CHECK-NEXT: to label %[[LIVE_WITH_DEAD_ENTRY_DEAD1:.*]] unwind label %[[LP1:.*]] +; CHECK: [[LP1]]: ; preds = %entry ; CHECK-NEXT: %lp = landingpad { i8*, i32 } ; CHECK-NEXT: catch i8* null ; CHECK-NEXT: invoke void @blowup() -; CHECK-NEXT: to label %live_with_dead_entry.dead1 unwind label %lp2 -; CHECK: lp2: ; preds = %lp1 +; CHECK-NEXT: to label %[[LIVE_WITH_DEAD_ENTRY_DEAD2:.*]] unwind label %[[LP2:.*]] +; CHECK: [[LP2]]: ; preds = %lp1 ; CHECK-NEXT: %0 = landingpad { i8*, i32 } ; CHECK-NEXT: catch i8* null ; CHECK-NEXT: br label %live_with_dead_entry -; CHECK: live_with_dead_entry.dead: +; CHECK: [[LIVE_WITH_DEAD_ENTRY_DEAD1]]: ; CHECK-NEXT: unreachable -; CHECK: live_with_dead_entry.dead1: +; CHECK: [[LIVE_WITH_DEAD_ENTRY_DEAD2]]: ; CHECK-NEXT: unreachable ; CHECK: live_with_dead_entry: ; preds = %lp2 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/Attributor/noreturn_async.ll b/llvm/test/Transforms/Attributor/noreturn_async.ll index 7c00a5a0b5cbf..9fb99159acf5d 100644 --- a/llvm/test/Transforms/Attributor/noreturn_async.ll +++ b/llvm/test/Transforms/Attributor/noreturn_async.ll @@ -100,9 +100,9 @@ define dso_local i32 @"?catchoverflow@@YAHXZ_may_throw"() personality i8* bitca entry: %retval = alloca i32, align 4 %__exception_code = alloca i32, align 4 -; CHECK: invoke void @"?overflow@@YAXXZ_may_throw"() +; CHECK: invoke void @"?overflow@@YAXXZ_may_throw"() ; CHECK: to label %invoke.cont unwind label %catch.dispatch - invoke void @"?overflow@@YAXXZ_may_throw"() + invoke void @"?overflow@@YAXXZ_may_throw"() to label %invoke.cont unwind label %catch.dispatch invoke.cont: ; preds = %entry diff --git a/llvm/test/Transforms/Attributor/undefined_behavior.ll b/llvm/test/Transforms/Attributor/undefined_behavior.ll index e9b782452182f..fd0ddb1ebb850 100644 --- a/llvm/test/Transforms/Attributor/undefined_behavior.ll +++ b/llvm/test/Transforms/Attributor/undefined_behavior.ll @@ -16,6 +16,16 @@ define void @load_wholly_unreachable() { ret void } +define void @loads_wholly_unreachable() { +; ATTRIBUTOR-LABEL: @loads_wholly_unreachable( +; ATTRIBUTOR-NEXT: unreachable +; + %a = load i32, i32* null + %b = load i32, i32* null + ret void +} + + define void @load_single_bb_unreachable(i1 %cond) { ; ATTRIBUTOR-LABEL: @load_single_bb_unreachable( ; ATTRIBUTOR-NEXT: br i1 [[COND:%.*]], label [[T:%.*]], label [[E:%.*]] diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll b/llvm/test/Transforms/InstCombine/fdiv.ll index 8bfeb67f6e066..ec1119cb24573 100644 --- a/llvm/test/Transforms/InstCombine/fdiv.ll +++ b/llvm/test/Transforms/InstCombine/fdiv.ll @@ -187,17 +187,17 @@ define float @div_with_div_denominator_extra_use(float %x, float %y, float %z) { ret float %div2 } -; Z / (1.0 / Y) +; Z / (1.0 / Y) ==> Y * Z define float @div_with_div_denominator_with_one_as_numerator_extra_use(float %x, float %y, float %z) { ; CHECK-LABEL: 
@div_with_div_denominator_with_one_as_numerator_extra_use( ; CHECK-NEXT: [[DIV1:%.*]] = fdiv float 1.000000e+00, [[Y:%.*]] -; CHECK-NEXT: [[DIV2:%.*]] = fdiv fast float [[Z:%.*]], [[DIV1]] +; CHECK-NEXT: [[DIV2:%.*]] = fmul reassoc arcp float [[Y]], [[Z:%.*]] ; CHECK-NEXT: call void @use_f32(float [[DIV1]]) ; CHECK-NEXT: ret float [[DIV2]] ; %div1 = fdiv float 1.0, %y - %div2 = fdiv fast float %z, %div1 + %div2 = fdiv reassoc arcp float %z, %div1 call void @use_f32(float %div1) ret float %div2 } diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll index 5581a3bd74719..4fe499ad4a24e 100644 --- a/llvm/test/Transforms/InstSimplify/select.ll +++ b/llvm/test/Transforms/InstSimplify/select.ll @@ -3,8 +3,7 @@ define i1 @bool_true_or_false(i1 %cond) { ; CHECK-LABEL: @bool_true_or_false( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], i1 true, i1 false -; CHECK-NEXT: ret i1 [[S]] +; CHECK-NEXT: ret i1 [[COND:%.*]] ; %s = select i1 %cond, i1 true, i1 false ret i1 %s @@ -12,8 +11,7 @@ define i1 @bool_true_or_false(i1 %cond) { define <2 x i1> @bool_true_or_false_vec(<2 x i1> %cond) { ; CHECK-LABEL: @bool_true_or_false_vec( -; CHECK-NEXT: [[S:%.*]] = select <2 x i1> [[COND:%.*]], <2 x i1> , <2 x i1> zeroinitializer -; CHECK-NEXT: ret <2 x i1> [[S]] +; CHECK-NEXT: ret <2 x i1> [[COND:%.*]] ; %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> zeroinitializer ret <2 x i1> %s @@ -21,8 +19,7 @@ define <2 x i1> @bool_true_or_false_vec(<2 x i1> %cond) { define <2 x i1> @bool_true_or_false_vec_undef(<2 x i1> %cond) { ; CHECK-LABEL: @bool_true_or_false_vec_undef( -; CHECK-NEXT: [[S:%.*]] = select <2 x i1> [[COND:%.*]], <2 x i1> , <2 x i1> -; CHECK-NEXT: ret <2 x i1> [[S]] +; CHECK-NEXT: ret <2 x i1> [[COND:%.*]] ; %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> ret <2 x i1> %s diff --git a/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll b/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll index 2067969a994ff..15016c1f7e7d9 100644 --- a/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll @@ -121,6 +121,55 @@ for.cond.cleanup: ; preds = %for.cond1.for.cond. ret void } +; Test that we don't unroll loops that only contain vector intrinsics. 
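+; The loop below is predicated through @llvm.arm.mve.vctp8 and masked
+; loads/stores, so unrolling should give no benefit; the CHECK-NOT ensures
+; only one predicated sub survives.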
+; CHECK-LABEL: test_intrinsics +; CHECK: call <16 x i8> @llvm.arm.mve.sub +; CHECK-NOT: call <16 x i8> @llvm.arm.mve.sub +define dso_local arm_aapcs_vfpcc void @test_intrinsics(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + %tmp8 = add i32 %N, 15 + %tmp9 = lshr i32 %tmp8, 4 + %tmp10 = shl nuw i32 %tmp9, 4 + %tmp11 = add i32 %tmp10, -16 + %tmp12 = lshr i32 %tmp11, 4 + %tmp13 = add nuw nsw i32 %tmp12, 1 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %0 = phi i32 [ %N, %vector.ph ], [ %2, %vector.body ] + %tmp = getelementptr inbounds i8, i8* %a, i32 %index + %1 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %0) + %2 = sub i32 %0, 16 + %tmp2 = bitcast i8* %tmp to <16 x i8>* + %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %1, <16 x i8> undef) + %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index + %tmp4 = bitcast i8* %tmp3 to <16 x i8>* + %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %1, <16 x i8> undef) + %sub = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> %wide.masked.load2, <16 x i8> %wide.masked.load, <16 x i1> %1, <16 x i8> undef) + %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i8>* + tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %sub, <16 x i8>* %tmp7, i32 4, <16 x i1> %1) + %index.next = add i32 %index, 16 + %tmp15 = sub i32 %tmp14, 1 + %tmp16 = icmp ne i32 %tmp15, 0 + br i1 %tmp16, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +declare <16 x i1> @llvm.arm.mve.vctp8(i32) +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) + + !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.isvectorized", i32 1} !2 = distinct !{!2, !3, !1} diff --git a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll index 890683043b4bf..d584238745cfe 100644 --- a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll @@ -1,4 +1,5 @@ ; RUN: opt -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/LoopUnrollAndJam/disable.ll b/llvm/test/Transforms/LoopUnrollAndJam/disable.ll index 4a00937b9c58c..6e879896c55fc 100644 --- a/llvm/test/Transforms/LoopUnrollAndJam/disable.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/disable.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -pass-remarks=loop-unroll-and-jam < %s -S 2>&1 | FileCheck %s +; RUN: opt -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 -pass-remarks=loop-unroll-and-jam < %s -S 2>&1 | FileCheck %s target datalayout = 
"e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll b/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll index 5254c779d0f87..f2c12702f9e89 100644 --- a/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime -unroll-partial-threshold=60 < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-runtime -unroll-partial-threshold=60 < %s -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll index bdb47c27f1cfc..7580b50f8703b 100644 --- a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll index f005845f2db85..7195e61c280fc 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll @@ -84,4 +84,4 @@ exit: ret void } -attributes #0 = { nounwind "min-legal-vector-width"="0" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "use-soft-float"="false" } +attributes #0 = { nounwind "min-legal-vector-width"="0" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll index cb6e1005db1c2..353e725580a4a 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll @@ -84,4 +84,4 @@ while.end: ; preds = %while.body, %entry ret void } -attributes #0 = { "target-features"="+armv8.1-m.main,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-crypto,-d32,-fp-armv8,-fp-armv8sp,-neon,-vfp3,-vfp3sp,-vfp4,-vfp4sp" } +attributes #0 = { "target-features"="+armv8.1-m.main,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-crypto,-d32,-fp-armv8,-fp-armv8sp,-neon,-vfp3,-vfp3sp,-vfp4,-vfp4sp" } diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll new file mode 100644 index 0000000000000..5a3438230a2d5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll 
@@ -0,0 +1,47 @@ +; RUN: opt < %s -loop-vectorize -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false -S | FileCheck %s + +; Check that when we can't predicate this loop, it is still vectorised (with +; an epilogue). +; TODO: the reason this can't be predicated is that a primary induction +; variable can't be found yet for this counting-down loop. Once that is +; fixed, it should be possible to predicate this loop. + +; CHECK-LABEL: vector.body: + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-arm-unknown-eabihf" + +define dso_local void @foo(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) #0 { +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ] + %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ] + %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ] + %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1 + %0 = load i8, i8* %A.addr.07, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1 + %1 = load i8, i8* %B.addr.08, align 1 + %add = add i8 %1, %0 + %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1 + store i8 %add, i8* %C.addr.09, align 1 + %dec = add i32 %N.addr.010, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end.loopexit, label %while.body + +while.end.loopexit: + br label %while.end + +while.end: + ret void +} + +attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" } diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll new file mode 100644 index 0000000000000..2667bfe68f616 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s + +; Check that when we can't predicate this loop, it is still vectorised (with +; an epilogue). +; TODO: the reason this can't be predicated is that a primary induction +; variable can't be found yet for this counting-down loop. Once that is +; fixed, it should be possible to predicate this loop.
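+; For illustration (a sketch, not part of this test): a primary induction
+; variable of the kind the vectoriser looks for counts up from 0, e.g.
+;   %i = phi i32 [ 0, %while.body.preheader ], [ %i.next, %while.body ]
+;   %i.next = add nuw i32 %i, 1
+;   %exit.cond = icmp eq i32 %i.next, %N
+; whereas this loop only decrements %N.addr.010 from %N towards 0.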
+ +; CHECK-LABEL: vector.body: + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + +define dso_local void @foo(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) { +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ] + %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ] + %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ] + %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1 + %0 = load i8, i8* %A.addr.07, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1 + %1 = load i8, i8* %B.addr.08, align 1 + %add = add i8 %1, %0 + %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1 + store i8 %add, i8* %C.addr.09, align 1 + %dec = add i32 %N.addr.010, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end.loopexit, label %while.body + +while.end.loopexit: + br label %while.end + +while.end: + ret void +} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll index 246f4e42c2fa7..b72ecf5c0e4c0 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll @@ -6,59 +6,63 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-LABEL: @transpose_multiply( ; CHECK-NEXT: entry: -; Load input matrixes %A and %B. - -; CHECK-NEXT: [[A:%.*]] = load <9 x double>, <9 x double>* [[A_PTR:%.*]] -; CHECK-NEXT: [[B:%.*]] = load <9 x double>, <9 x double>* [[B_PTR:%.*]] - -; Extract columns from loaded value %A. - -; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> <i32 0, i32 1, i32 2> -; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> <i32 3, i32 4, i32 5> -; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> <i32 6, i32 7, i32 8> +; Load columns of input matrices %A and %B.
+ +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST]], align 8 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 3 +; CHECK-NEXT: [[COL_CAST1:%.*]] = bitcast double* [[COL_GEP]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST1]], align 8 +; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[COL_CAST4:%.*]] = bitcast double* [[COL_GEP3]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST4]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST6]], align 8 +; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i32 3 +; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP8]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST9]], align 8 +; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i32 6 +; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast double* [[COL_GEP11]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST12]], align 8 ; Transpose %A. -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x double> undef, double [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x double> undef, double [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 2 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x double> undef, double [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 2 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; 
CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 2 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2 -; Extract columns from %B. - -; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> - ; Lower multiply(transpose(%A), %B) ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP18]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] ; CHECK-NEXT: [[BLOCK6:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT7:%.*]] = insertelement <1 x double> undef, double [[TMP20]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT8:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT7]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = fmul <1 x double> [[BLOCK6]], [[SPLAT_SPLAT8]] ; CHECK-NEXT: [[TMP22:%.*]] = fadd <1 x double> [[TMP19]], [[TMP21]] ; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> undef, double [[TMP23]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] @@ -66,18 +70,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x double> [[TMP25]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP26]], <3 x i32> ; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> undef, double [[TMP28]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP29:%.*]] = fmul <1 x double> [[BLOCK12]], [[SPLAT_SPLAT14]] ; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> undef, double 
[[TMP30]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]] ; CHECK-NEXT: [[TMP32:%.*]] = fadd <1 x double> [[TMP29]], [[TMP31]] ; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> undef, double [[TMP33]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = fmul <1 x double> [[BLOCK18]], [[SPLAT_SPLAT20]] @@ -85,18 +89,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <1 x double> [[TMP35]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <3 x double> [[TMP27]], <3 x double> [[TMP36]], <3 x i32> ; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT22:%.*]] = insertelement <1 x double> undef, double [[TMP38]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT23:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT22]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP39:%.*]] = fmul <1 x double> [[BLOCK21]], [[SPLAT_SPLAT23]] ; CHECK-NEXT: [[BLOCK24:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT25:%.*]] = insertelement <1 x double> undef, double [[TMP40]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT26:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT25]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP41:%.*]] = fmul <1 x double> [[BLOCK24]], [[SPLAT_SPLAT26]] ; CHECK-NEXT: [[TMP42:%.*]] = fadd <1 x double> [[TMP39]], [[TMP41]] ; CHECK-NEXT: [[BLOCK27:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT28:%.*]] = insertelement <1 x double> undef, double [[TMP43]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT29:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT28]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = fmul <1 x double> [[BLOCK27]], [[SPLAT_SPLAT29]] @@ -104,18 +108,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <1 x double> [[TMP45]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <3 x double> [[TMP37]], <3 x double> [[TMP46]], <3 x i32> ; CHECK-NEXT: [[BLOCK30:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; 
CHECK-NEXT: [[SPLAT_SPLATINSERT31:%.*]] = insertelement <1 x double> undef, double [[TMP48]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT32:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT31]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = fmul <1 x double> [[BLOCK30]], [[SPLAT_SPLAT32]] ; CHECK-NEXT: [[BLOCK33:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT34:%.*]] = insertelement <1 x double> undef, double [[TMP50]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT35:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT34]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP51:%.*]] = fmul <1 x double> [[BLOCK33]], [[SPLAT_SPLAT35]] ; CHECK-NEXT: [[TMP52:%.*]] = fadd <1 x double> [[TMP49]], [[TMP51]] ; CHECK-NEXT: [[BLOCK36:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT37:%.*]] = insertelement <1 x double> undef, double [[TMP53]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT38:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT37]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP54:%.*]] = fmul <1 x double> [[BLOCK36]], [[SPLAT_SPLAT38]] @@ -123,18 +127,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <1 x double> [[TMP55]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP56]], <3 x i32> ; CHECK-NEXT: [[BLOCK39:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT40:%.*]] = insertelement <1 x double> undef, double [[TMP58]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT40]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP59:%.*]] = fmul <1 x double> [[BLOCK39]], [[SPLAT_SPLAT41]] ; CHECK-NEXT: [[BLOCK42:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT43:%.*]] = insertelement <1 x double> undef, double [[TMP60]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT44:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT43]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP61:%.*]] = fmul <1 x double> [[BLOCK42]], [[SPLAT_SPLAT44]] ; CHECK-NEXT: [[TMP62:%.*]] = fadd <1 x double> [[TMP59]], [[TMP61]] ; CHECK-NEXT: [[BLOCK45:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT46:%.*]] = insertelement <1 x double> undef, double [[TMP63]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT47:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT46]], <1 x double> undef, 
<1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP64:%.*]] = fmul <1 x double> [[BLOCK45]], [[SPLAT_SPLAT47]] @@ -142,18 +146,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <1 x double> [[TMP65]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <3 x double> [[TMP57]], <3 x double> [[TMP66]], <3 x i32> ; CHECK-NEXT: [[BLOCK48:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = insertelement <1 x double> undef, double [[TMP68]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT49]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP69:%.*]] = fmul <1 x double> [[BLOCK48]], [[SPLAT_SPLAT50]] ; CHECK-NEXT: [[BLOCK51:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = insertelement <1 x double> undef, double [[TMP70]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT52]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP71:%.*]] = fmul <1 x double> [[BLOCK51]], [[SPLAT_SPLAT53]] ; CHECK-NEXT: [[TMP72:%.*]] = fadd <1 x double> [[TMP69]], [[TMP71]] ; CHECK-NEXT: [[BLOCK54:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT55:%.*]] = insertelement <1 x double> undef, double [[TMP73]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT55]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP74:%.*]] = fmul <1 x double> [[BLOCK54]], [[SPLAT_SPLAT56]] @@ -161,18 +165,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <1 x double> [[TMP75]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP77:%.*]] = shufflevector <3 x double> [[TMP67]], <3 x double> [[TMP76]], <3 x i32> ; CHECK-NEXT: [[BLOCK57:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT58:%.*]] = insertelement <1 x double> undef, double [[TMP78]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT58]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP79:%.*]] = fmul <1 x double> [[BLOCK57]], [[SPLAT_SPLAT59]] ; CHECK-NEXT: [[BLOCK60:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT61:%.*]] = insertelement <1 x double> undef, double [[TMP80]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT62:%.*]] = shufflevector <1 x double> 
[[SPLAT_SPLATINSERT61]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP81:%.*]] = fmul <1 x double> [[BLOCK60]], [[SPLAT_SPLAT62]] ; CHECK-NEXT: [[TMP82:%.*]] = fadd <1 x double> [[TMP79]], [[TMP81]] ; CHECK-NEXT: [[BLOCK63:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP83:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT64:%.*]] = insertelement <1 x double> undef, double [[TMP83]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT65:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT64]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP84:%.*]] = fmul <1 x double> [[BLOCK63]], [[SPLAT_SPLAT65]] @@ -180,18 +184,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <1 x double> [[TMP85]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP87:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP86]], <3 x i32> ; CHECK-NEXT: [[BLOCK66:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT67:%.*]] = insertelement <1 x double> undef, double [[TMP88]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT68:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT67]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP89:%.*]] = fmul <1 x double> [[BLOCK66]], [[SPLAT_SPLAT68]] ; CHECK-NEXT: [[BLOCK69:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP90:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP90:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT70:%.*]] = insertelement <1 x double> undef, double [[TMP90]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT71:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT70]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP91:%.*]] = fmul <1 x double> [[BLOCK69]], [[SPLAT_SPLAT71]] ; CHECK-NEXT: [[TMP92:%.*]] = fadd <1 x double> [[TMP89]], [[TMP91]] ; CHECK-NEXT: [[BLOCK72:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT73:%.*]] = insertelement <1 x double> undef, double [[TMP93]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT74:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT73]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP94:%.*]] = fmul <1 x double> [[BLOCK72]], [[SPLAT_SPLAT74]] @@ -199,18 +203,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <1 x double> [[TMP95]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP97:%.*]] = shufflevector <3 x double> [[TMP87]], <3 x double> [[TMP96]], <3 x i32> ; CHECK-NEXT: [[BLOCK75:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT76:%.*]] = insertelement <1 x double> undef, double 
[[TMP98]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT77:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT76]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP99:%.*]] = fmul <1 x double> [[BLOCK75]], [[SPLAT_SPLAT77]] ; CHECK-NEXT: [[BLOCK78:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT79:%.*]] = insertelement <1 x double> undef, double [[TMP100]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT80:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT79]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP101:%.*]] = fmul <1 x double> [[BLOCK78]], [[SPLAT_SPLAT80]] ; CHECK-NEXT: [[TMP102:%.*]] = fadd <1 x double> [[TMP99]], [[TMP101]] ; CHECK-NEXT: [[BLOCK81:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT82:%.*]] = insertelement <1 x double> undef, double [[TMP103]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT83:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT82]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP104:%.*]] = fmul <1 x double> [[BLOCK81]], [[SPLAT_SPLAT83]] @@ -248,59 +252,61 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-LABEL: @transpose_multiply_add( ; CHECK-NEXT: entry: -; Load input matrixes %A and %B. - -; CHECK-NEXT: [[A:%.*]] = load <9 x double>, <9 x double>* [[A_PTR:%.*]] -; CHECK-NEXT: [[B:%.*]] = load <9 x double>, <9 x double>* [[B_PTR:%.*]] - -; Extract columns from loaded value %A. 
- -; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST]], align 8 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 3 +; CHECK-NEXT: [[COL_CAST1:%.*]] = bitcast double* [[COL_GEP]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST1]], align 8 +; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[COL_CAST4:%.*]] = bitcast double* [[COL_GEP3]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST4]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST6]], align 8 +; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i32 3 +; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP8]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST9]], align 8 +; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i32 6 +; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast double* [[COL_GEP11]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST12]], align 8 ; Transpose %A. 
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x double> undef, double [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x double> undef, double [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 2 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x double> undef, double [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 2 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 2 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2 -; Extract columns from %B. 
- -; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> - ; Lower multiply(transpose(%A), %B) ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP18]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] ; CHECK-NEXT: [[BLOCK6:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT7:%.*]] = insertelement <1 x double> undef, double [[TMP20]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT8:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT7]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = fmul <1 x double> [[BLOCK6]], [[SPLAT_SPLAT8]] ; CHECK-NEXT: [[TMP22:%.*]] = fadd <1 x double> [[TMP19]], [[TMP21]] ; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> undef, double [[TMP23]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] @@ -308,18 +314,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x double> [[TMP25]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP26]], <3 x i32> ; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> undef, double [[TMP28]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP29:%.*]] = fmul <1 x double> [[BLOCK12]], [[SPLAT_SPLAT14]] ; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> undef, double [[TMP30]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x 
double> [[BLOCK15]], [[SPLAT_SPLAT17]] ; CHECK-NEXT: [[TMP32:%.*]] = fadd <1 x double> [[TMP29]], [[TMP31]] ; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> undef, double [[TMP33]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = fmul <1 x double> [[BLOCK18]], [[SPLAT_SPLAT20]] @@ -327,18 +333,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <1 x double> [[TMP35]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <3 x double> [[TMP27]], <3 x double> [[TMP36]], <3 x i32> ; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT22:%.*]] = insertelement <1 x double> undef, double [[TMP38]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT23:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT22]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP39:%.*]] = fmul <1 x double> [[BLOCK21]], [[SPLAT_SPLAT23]] ; CHECK-NEXT: [[BLOCK24:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT25:%.*]] = insertelement <1 x double> undef, double [[TMP40]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT26:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT25]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP41:%.*]] = fmul <1 x double> [[BLOCK24]], [[SPLAT_SPLAT26]] ; CHECK-NEXT: [[TMP42:%.*]] = fadd <1 x double> [[TMP39]], [[TMP41]] ; CHECK-NEXT: [[BLOCK27:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT28:%.*]] = insertelement <1 x double> undef, double [[TMP43]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT29:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT28]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = fmul <1 x double> [[BLOCK27]], [[SPLAT_SPLAT29]] @@ -346,18 +352,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <1 x double> [[TMP45]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <3 x double> [[TMP37]], <3 x double> [[TMP46]], <3 x i32> ; CHECK-NEXT: [[BLOCK30:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT31:%.*]] = insertelement <1 x double> undef, double [[TMP48]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT32:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT31]], <1 x double> 
undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = fmul <1 x double> [[BLOCK30]], [[SPLAT_SPLAT32]] ; CHECK-NEXT: [[BLOCK33:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT34:%.*]] = insertelement <1 x double> undef, double [[TMP50]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT35:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT34]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP51:%.*]] = fmul <1 x double> [[BLOCK33]], [[SPLAT_SPLAT35]] ; CHECK-NEXT: [[TMP52:%.*]] = fadd <1 x double> [[TMP49]], [[TMP51]] ; CHECK-NEXT: [[BLOCK36:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT37:%.*]] = insertelement <1 x double> undef, double [[TMP53]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT38:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT37]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP54:%.*]] = fmul <1 x double> [[BLOCK36]], [[SPLAT_SPLAT38]] @@ -365,18 +371,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <1 x double> [[TMP55]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP56]], <3 x i32> ; CHECK-NEXT: [[BLOCK39:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT40:%.*]] = insertelement <1 x double> undef, double [[TMP58]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT40]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP59:%.*]] = fmul <1 x double> [[BLOCK39]], [[SPLAT_SPLAT41]] ; CHECK-NEXT: [[BLOCK42:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT43:%.*]] = insertelement <1 x double> undef, double [[TMP60]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT44:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT43]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP61:%.*]] = fmul <1 x double> [[BLOCK42]], [[SPLAT_SPLAT44]] ; CHECK-NEXT: [[TMP62:%.*]] = fadd <1 x double> [[TMP59]], [[TMP61]] ; CHECK-NEXT: [[BLOCK45:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT46:%.*]] = insertelement <1 x double> undef, double [[TMP63]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT47:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT46]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP64:%.*]] = fmul <1 x double> [[BLOCK45]], [[SPLAT_SPLAT47]] @@ -384,18 +390,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* 
%B.Ptr, ; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <1 x double> [[TMP65]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <3 x double> [[TMP57]], <3 x double> [[TMP66]], <3 x i32> ; CHECK-NEXT: [[BLOCK48:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = insertelement <1 x double> undef, double [[TMP68]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT49]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP69:%.*]] = fmul <1 x double> [[BLOCK48]], [[SPLAT_SPLAT50]] ; CHECK-NEXT: [[BLOCK51:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = insertelement <1 x double> undef, double [[TMP70]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT52]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP71:%.*]] = fmul <1 x double> [[BLOCK51]], [[SPLAT_SPLAT53]] ; CHECK-NEXT: [[TMP72:%.*]] = fadd <1 x double> [[TMP69]], [[TMP71]] ; CHECK-NEXT: [[BLOCK54:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT55:%.*]] = insertelement <1 x double> undef, double [[TMP73]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT55]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP74:%.*]] = fmul <1 x double> [[BLOCK54]], [[SPLAT_SPLAT56]] @@ -403,18 +409,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <1 x double> [[TMP75]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP77:%.*]] = shufflevector <3 x double> [[TMP67]], <3 x double> [[TMP76]], <3 x i32> ; CHECK-NEXT: [[BLOCK57:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT58:%.*]] = insertelement <1 x double> undef, double [[TMP78]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT58]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP79:%.*]] = fmul <1 x double> [[BLOCK57]], [[SPLAT_SPLAT59]] ; CHECK-NEXT: [[BLOCK60:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT61:%.*]] = insertelement <1 x double> undef, double [[TMP80]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT62:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT61]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP81:%.*]] = fmul <1 x double> [[BLOCK60]], [[SPLAT_SPLAT62]] ; CHECK-NEXT: [[TMP82:%.*]] = fadd <1 x double> [[TMP79]], [[TMP81]] ; 
CHECK-NEXT: [[BLOCK63:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP83:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT64:%.*]] = insertelement <1 x double> undef, double [[TMP83]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT65:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT64]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP84:%.*]] = fmul <1 x double> [[BLOCK63]], [[SPLAT_SPLAT65]] @@ -422,18 +428,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <1 x double> [[TMP85]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP87:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP86]], <3 x i32> ; CHECK-NEXT: [[BLOCK66:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT67:%.*]] = insertelement <1 x double> undef, double [[TMP88]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT68:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT67]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP89:%.*]] = fmul <1 x double> [[BLOCK66]], [[SPLAT_SPLAT68]] ; CHECK-NEXT: [[BLOCK69:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP90:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP90:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT70:%.*]] = insertelement <1 x double> undef, double [[TMP90]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT71:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT70]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP91:%.*]] = fmul <1 x double> [[BLOCK69]], [[SPLAT_SPLAT71]] ; CHECK-NEXT: [[TMP92:%.*]] = fadd <1 x double> [[TMP89]], [[TMP91]] ; CHECK-NEXT: [[BLOCK72:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT73:%.*]] = insertelement <1 x double> undef, double [[TMP93]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT74:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT73]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP94:%.*]] = fmul <1 x double> [[BLOCK72]], [[SPLAT_SPLAT74]] @@ -441,18 +447,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <1 x double> [[TMP95]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP97:%.*]] = shufflevector <3 x double> [[TMP87]], <3 x double> [[TMP96]], <3 x i32> ; CHECK-NEXT: [[BLOCK75:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT76:%.*]] = insertelement <1 x double> undef, double [[TMP98]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT77:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT76]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP99:%.*]] = fmul <1 x double> [[BLOCK75]], 
[[SPLAT_SPLAT77]] ; CHECK-NEXT: [[BLOCK78:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT79:%.*]] = insertelement <1 x double> undef, double [[TMP100]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT80:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT79]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP101:%.*]] = fmul <1 x double> [[BLOCK78]], [[SPLAT_SPLAT80]] ; CHECK-NEXT: [[TMP102:%.*]] = fadd <1 x double> [[TMP99]], [[TMP101]] ; CHECK-NEXT: [[BLOCK81:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT82:%.*]] = insertelement <1 x double> undef, double [[TMP103]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT83:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT82]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP104:%.*]] = fmul <1 x double> [[BLOCK81]], [[SPLAT_SPLAT83]] @@ -465,19 +471,21 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; Load %C. -; CHECK-NEXT: [[C:%.*]] = load <9 x double>, <9 x double>* [[C_PTR:%.*]] - -; Extract columns from %C. - -; CHECK-NEXT: [[SPLIT84:%.*]] = shufflevector <9 x double> [[C]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT85:%.*]] = shufflevector <9 x double> [[C]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT86:%.*]] = shufflevector <9 x double> [[C]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP110:%.*]] = bitcast <9 x double>* [[C_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST92:%.*]] = bitcast double* [[TMP110]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD93:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST92]], align 8 +; CHECK-NEXT: [[COL_GEP94:%.*]] = getelementptr double, double* [[TMP110]], i32 3 +; CHECK-NEXT: [[COL_CAST95:%.*]] = bitcast double* [[COL_GEP94]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD96:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST95]], align 8 +; CHECK-NEXT: [[COL_GEP97:%.*]] = getelementptr double, double* [[TMP110]], i32 6 +; CHECK-NEXT: [[COL_CAST98:%.*]] = bitcast double* [[COL_GEP97]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD99:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST98]], align 8 ; Add column vectors. -; CHECK-NEXT: [[TMP108:%.*]] = fadd <3 x double> [[SPLIT84]], [[TMP47]] -; CHECK-NEXT: [[TMP109:%.*]] = fadd <3 x double> [[SPLIT85]], [[TMP77]] -; CHECK-NEXT: [[TMP110:%.*]] = fadd <3 x double> [[SPLIT86]], [[TMP107]] +; CHECK-NEXT: [[TMP108:%.*]] = fadd <3 x double> [[COL_LOAD93]], [[TMP47]] +; CHECK-NEXT: [[TMP109:%.*]] = fadd <3 x double> [[COL_LOAD96]], [[TMP77]] +; CHECK-NEXT: [[TMP110:%.*]] = fadd <3 x double> [[COL_LOAD99]], [[TMP107]] ; Store result columns. 
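The new CHECK lines in this hunk reflect the core change to the matrix lowering: each column is now read with its own narrow load (a bitcast of the matrix pointer to double*, a getelementptr to the start of the column, and a <3 x double> load) instead of one wide <9 x double> load that is then split into columns with shufflevectors. Below is a minimal hand-written sketch of that per-column access pattern, assuming a column-major 3 x 3 matrix of doubles; the function name and parameters are illustrative and not taken from the pass output:

define <3 x double> @load_column_3x3(<9 x double>* %M, i32 %col) {
entry:
  ; View the 3x3 matrix as a flat array of 9 doubles.
  %base = bitcast <9 x double>* %M to double*
  ; Column %col starts at element %col * 3 (the stride is the number of rows).
  %offset = mul i32 %col, 3
  %col.start = getelementptr double, double* %base, i32 %offset
  ; A single narrow vector load replaces the wide load plus shufflevector split.
  %col.ptr = bitcast double* %col.start to <3 x double>*
  %column = load <3 x double>, <3 x double>* %col.ptr, align 8
  ret <3 x double> %column
}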
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll new file mode 100644 index 0000000000000..89ca79649b879 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll @@ -0,0 +1,96 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +define <8 x double> @fadd_transpose(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: @fadd_transpose( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> undef, <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> <i32 4, i32 5> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> <i32 6, i32 7> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> undef, <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> <i32 4, i32 5> +; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> <i32 6, i32 7> +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[SPLIT]], [[SPLIT4]] +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[SPLIT1]], [[SPLIT5]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[SPLIT2]], [[SPLIT6]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SPLIT3]], [[SPLIT7]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP0]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> undef, double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP15]], double [[TMP16]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: ret <8 x double> [[TMP20]] +; +entry: + %add = fadd <8 x double> %a, %b + %c = call <8 x double> @llvm.matrix.transpose(<8 x double> %add, i32 2, i32 4) + ret <8 x double> %c +} + +define <8 x double> @load_fadd_transpose(<8 x double>* %A.Ptr, <8 x double> %b) { +; CHECK-LABEL: @load_fadd_transpose( +; CHECK-NEXT: entry: + +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[A_PTR:%.*]] to double* +; CHECK-NEXT:
[[COL_CAST:%.*]] = bitcast double* [[TMP0]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST]], align 8 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 2 +; CHECK-NEXT: [[COL_CAST1:%.*]] = bitcast double* [[COL_GEP]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST1]], align 8 +; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 4 +; CHECK-NEXT: [[COL_CAST4:%.*]] = bitcast double* [[COL_GEP3]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST4]], align 8 +; CHECK-NEXT: [[COL_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[COL_CAST7:%.*]] = bitcast double* [[COL_GEP6]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST7]], align 8 +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> + +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[COL_LOAD]], [[SPLIT4]] +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[COL_LOAD2]], [[SPLIT5]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[COL_LOAD5]], [[SPLIT6]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[COL_LOAD8]], [[SPLIT7]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP0]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> undef, double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP15]], double [[TMP16]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> [[TMP19]], <8 x i32> +; CHECK-NEXT: ret <8 x double> [[TMP20]] +; +entry: + %a = load <8 x double>, <8 x double>* %A.Ptr + %add = fadd <8 x double> %a, %b + %c = call <8 x double> @llvm.matrix.transpose(<8 x double> %add, i32 2, i32 4) + ret <8 x double> %c +} + +declare <8 x double> @llvm.matrix.transpose(<8 x double>, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll new file mode 100644 index 0000000000000..591cddd261cf6 
--- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +; Check that we use flattened vectors for PHI operands and extract the columns afterwards. +define <9 x double> @unsupported_phi(i1 %cond, <9 x double> %A, <9 x double> %B, <9 x double> %C) { +; CHECK-LABEL: @unsupported_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] + +; CHECK-LABEL: if.then: +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <9 x double> [[A:%.*]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x double> undef, double [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x double> undef, double [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x double> undef, double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> [[TMP11]], <6 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <6 x double> [[TMP18]], <6 x double> [[TMP19]], <9 x i32> +; CHECK-NEXT: br label [[IF_END:%.*]] + +; CHECK-LABEL: if.else: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <9 x double> [[B:%.*]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x double> undef, double [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x double> [[TMP22]], double [[TMP23]], i64 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; 
CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x double> [[TMP24]], double [[TMP25]], i64 2 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <3 x double> undef, double [[TMP27]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <3 x double> [[TMP28]], double [[TMP29]], i64 1 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <3 x double> [[TMP30]], double [[TMP31]], i64 2 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <3 x double> undef, double [[TMP33]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <3 x double> [[TMP34]], double [[TMP35]], i64 1 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <3 x double> [[TMP36]], double [[TMP37]], i64 2 +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <3 x double> [[TMP26]], <3 x double> [[TMP32]], <6 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <3 x double> [[TMP38]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <6 x double> [[TMP39]], <6 x double> [[TMP40]], <9 x i32> +; CHECK-NEXT: br label [[IF_END]] + +; CHECK-LABEL: if.end: +; CHECK-NEXT: [[MERGE:%.*]] = phi <9 x double> [ [[TMP20]], [[IF_THEN]] ], [ [[TMP41]], [[IF_ELSE]] ] +; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <9 x double> [[C:%.*]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <9 x double> [[C]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT8:%.*]] = shufflevector <9 x double> [[C]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT9:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT10:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT11:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> undef, <1 x i32> zeroinitializer +; +entry: + br i1 %cond, label %if.then, label %if.else + +if.then: ; preds = %entry + %A.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %A, i32 3, i32 3) + br label %if.end + +if.else: ; preds = %entry + %B.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %B, i32 3, i32 3) + br label %if.end + +if.end: ; preds = %if.then, %if.else + %merge = phi <9 x double> [ %A.trans, %if.then ], [ %B.trans, %if.else ] + %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %C, <9 x double> %merge, i32 3, i32 3, i32 3) + ret <9 x double> %res +} + +; Make sure we use a flattened vector when calling @foo and then use its flat vector result properly. 
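+; (A call to an arbitrary function such as @foo is opaque to the shape +; propagation: the lowering must materialize the flat <9 x double> operand +; right before the call and re-split the flat result into columns afterwards, +; as the CHECK lines below verify.)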
+define <9 x double> @unsupported_call(i1 %cond, <9 x double> %A, <9 x double> %B) { +; CHECK-LABEL: @unsupported_call( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <9 x double> [[A:%.*]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x double> undef, double [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x double> [[TMP2]], double [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <3 x double> [[TMP4]], double [[TMP5]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <3 x double> undef, double [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x double> [[TMP8]], double [[TMP9]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <3 x double> [[TMP10]], double [[TMP11]], i64 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <3 x double> undef, double [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <3 x double> [[TMP14]], double [[TMP15]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <3 x double> [[TMP16]], double [[TMP17]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <3 x double> [[TMP6]], <3 x double> [[TMP12]], <6 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <3 x double> [[TMP18]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <6 x double> [[TMP19]], <6 x double> [[TMP20]], <9 x i32> +; CHECK-NEXT: [[A_FOO:%.*]] = call <9 x double> @foo(<9 x double> [[TMP21]]) +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <9 x double> [[B:%.*]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <9 x double> [[A_FOO]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <9 x double> [[A_FOO]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT8:%.*]] = shufflevector <9 x double> [[A_FOO]], <9 x double> undef, <3 x i32> +; + %A.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %A, i32 3, i32 3) + %A.foo = call <9 x double> @foo(<9 x double> %A.trans) + %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %B, <9 x double> %A.foo, i32 3, i32 3, i32 3) + ret <9 x double> %res +} + +declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32 immarg, i32 immarg, i32 immarg) +declare <9 x double> @llvm.matrix.transpose.v9f64(<9 x double>, i32 immarg, i32 immarg) +declare <9 x double> @foo(<9 x double>) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll 
b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll new file mode 100644 index 0000000000000..38200b3883dc0 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + + +; Make sure we propagate in multiple iterations. First, we back-propagate the +; shape information from the transpose to %A, in the next iteration we +; forward-propagate it to %Mul, and then back to %B. +define <16 x double> @backpropagation_iterations(<16 x double>* %A.Ptr, <16 x double>* %B.Ptr) { +; CHECK-LABEL: @backpropagation_iterations( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x double>* [[A_PTR:%.*]] to double* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <4 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, double* [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP5]] to <4 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, <4 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr double, double* [[TMP1]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP9]] to <4 x double>* +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x double>, <4 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP1]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>* +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x double>, <4 x double>* [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x double> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> undef, double [[TMP16]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x double> [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x double> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x double> [[TMP19]], double [[TMP20]], i64 2 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x double> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x double> [[TMP21]], double [[TMP22]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x double> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x double> undef, double [[TMP24]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x double> [[TMP7]], i64 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> [[TMP25]], double [[TMP26]], i64 1 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x double> [[TMP11]], i64 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x double> [[TMP27]], double [[TMP28]], i64 2 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x double> [[TMP15]], i64 1 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x double> [[TMP29]], double [[TMP30]], i64 3 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x double> undef, double [[TMP32]], i64 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x double> [[TMP33]], double [[TMP34]], i64 1 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP11]], i64 2 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x double> [[TMP35]], double 
[[TMP36]], i64 2 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP15]], i64 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x double> [[TMP37]], double [[TMP38]], i64 3 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x double> [[TMP3]], i64 3 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x double> undef, double [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[TMP7]], i64 3 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x double> [[TMP41]], double [[TMP42]], i64 1 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[TMP11]], i64 3 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x double> [[TMP43]], double [[TMP44]], i64 2 +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x double> [[TMP15]], i64 3 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x double> [[TMP45]], double [[TMP46]], i64 3 +; CHECK-NEXT: [[TMP48:%.*]] = bitcast <16 x double>* [[B_PTR:%.*]] to double* +; CHECK-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>* +; CHECK-NEXT: [[TMP50:%.*]] = load <4 x double>, <4 x double>* [[TMP49]], align 8 +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP48]], i32 4 +; CHECK-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>* +; CHECK-NEXT: [[TMP54:%.*]] = load <4 x double>, <4 x double>* [[TMP53]], align 8 +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP48]], i32 8 +; CHECK-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* +; CHECK-NEXT: [[TMP58:%.*]] = load <4 x double>, <4 x double>* [[TMP57]], align 8 +; CHECK-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP48]], i32 12 +; CHECK-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* +; CHECK-NEXT: [[TMP62:%.*]] = load <4 x double>, <4 x double>* [[TMP61]], align 8 +; CHECK-NEXT: [[TMP63:%.*]] = fmul <4 x double> [[TMP3]], [[TMP50]] +; CHECK-NEXT: [[TMP64:%.*]] = fmul <4 x double> [[TMP7]], [[TMP54]] +; CHECK-NEXT: [[TMP65:%.*]] = fmul <4 x double> [[TMP11]], [[TMP58]] +; CHECK-NEXT: [[TMP66:%.*]] = fmul <4 x double> [[TMP15]], [[TMP62]] +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <4 x double> [[TMP63]], <4 x double> [[TMP64]], <8 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <4 x double> [[TMP65]], <4 x double> [[TMP66]], <8 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <8 x double> [[TMP67]], <8 x double> [[TMP68]], <16 x i32> +; CHECK-NEXT: ret <16 x double> [[TMP69]] +; + %A = load <16 x double>, <16 x double>* %A.Ptr + %A.trans = tail call <16 x double> @llvm.matrix.transpose.v16f64(<16 x double> %A, i32 4, i32 4) + %B = load <16 x double>, <16 x double>* %B.Ptr + %Mul = fmul <16 x double> %A, %B + ret <16 x double> %Mul +} + +declare <16 x double> @llvm.matrix.multiply.v16f64.v16f64.v16f64(<16 x double>, <16 x double>, i32 immarg, i32 immarg, i32 immarg) +declare <16 x double> @llvm.matrix.transpose.v16f64(<16 x double>, i32 immarg, i32 immarg) diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index adbf1b3b8c608..65b831c96e8f6 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -22,15 +22,18 @@ using namespace llvm; static std::string convertToErrorFromString(StringRef Str) { llvm::APFloat F(0.0); - auto ErrOrStatus = + auto StatusOrErr = F.convertFromString(Str, llvm::APFloat::rmNearestTiesToEven); - EXPECT_TRUE(!ErrOrStatus); - return toString(ErrOrStatus.takeError()); + EXPECT_TRUE(!StatusOrErr); + return toString(StatusOrErr.takeError()); } static double convertToDoubleFromString(StringRef 
Str) { llvm::APFloat F(0.0); - EXPECT_FALSE(!F.convertFromString(Str, llvm::APFloat::rmNearestTiesToEven)); + auto StatusOrErr = + F.convertFromString(Str, llvm::APFloat::rmNearestTiesToEven); + EXPECT_FALSE(!StatusOrErr); + consumeError(StatusOrErr.takeError()); return F.convertToDouble(); } diff --git a/llvm/unittests/ADT/TripleTest.cpp b/llvm/unittests/ADT/TripleTest.cpp index d8123bbbfdf7a..ef7f82d268e2e 100644 --- a/llvm/unittests/ADT/TripleTest.cpp +++ b/llvm/unittests/ADT/TripleTest.cpp @@ -163,6 +163,13 @@ TEST(TripleTest, ParsedIDs) { EXPECT_EQ(Triple::UnknownOS, T.getOS()); EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + T = Triple("powerpcspe-unknown-freebsd"); + EXPECT_EQ(Triple::ppc, T.getArch()); + EXPECT_EQ(Triple::PPCSubArch_spe, T.getSubArch()); + EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); + EXPECT_EQ(Triple::FreeBSD, T.getOS()); + EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + T = Triple("arm-none-none-eabi"); EXPECT_EQ(Triple::arm, T.getArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); @@ -312,6 +319,12 @@ TEST(TripleTest, ParsedIDs) { EXPECT_EQ(Triple::AMDPAL, T.getOS()); EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + T = Triple("ve-unknown-linux"); + EXPECT_EQ(Triple::ve, T.getArch()); + EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); + EXPECT_EQ(Triple::Linux, T.getOS()); + EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + T = Triple("riscv32-unknown-unknown"); EXPECT_EQ(Triple::riscv32, T.getArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); @@ -715,6 +728,8 @@ TEST(TripleTest, Normalization) { Triple::normalize("i686-linux")); // i686-pc-linux-gnu EXPECT_EQ("arm-none-unknown-eabi", Triple::normalize("arm-none-eabi")); // arm-none-eabi + EXPECT_EQ("ve-unknown-linux", + Triple::normalize("ve-linux")); // ve-linux EXPECT_EQ("wasm32-unknown-wasi", Triple::normalize("wasm32-wasi")); // wasm32-unknown-wasi EXPECT_EQ("wasm64-unknown-wasi", diff --git a/llvm/unittests/CodeGen/MachineOperandTest.cpp b/llvm/unittests/CodeGen/MachineOperandTest.cpp index faa471f2260c7..7e60fab281545 100644 --- a/llvm/unittests/CodeGen/MachineOperandTest.cpp +++ b/llvm/unittests/CodeGen/MachineOperandTest.cpp @@ -310,7 +310,7 @@ TEST(MachineOperandTest, PrintMetadata) { std::string str; // Print a MachineOperand containing a metadata node. 
raw_string_ostream OS(str); - MO.print(OS, MST, LLT{}, /*PrintDef=*/false, /*IsStandalone=*/false, + MO.print(OS, MST, LLT{}, /*OpIdx*/~0U, /*PrintDef=*/false, /*IsStandalone=*/false, /*ShouldPrintRegisterTies=*/false, 0, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr); ASSERT_TRUE(OS.str() == "!0"); diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.h b/llvm/utils/TableGen/CodeGenDAGPatterns.h index c61b55052533a..2c081b670609d 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.h @@ -1195,12 +1195,6 @@ class CodeGenDAGPatterns { return F->second; } - typedef std::map<Record *, NodeXForm, LessRecordByID>::const_iterator - nx_iterator; - nx_iterator nx_begin() const { return SDNodeXForms.begin(); } - nx_iterator nx_end() const { return SDNodeXForms.end(); } - - const ComplexPattern &getComplexPattern(Record *R) const { auto F = ComplexPatterns.find(R); assert(F != ComplexPatterns.end() && "Unknown addressing mode!"); diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index b2e3903eda8b5..99b067d5b5270 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -5283,7 +5283,7 @@ void GlobalISelEmitter::run(raw_ostream &OS) { OS << Target.getName() << "InstructionSelector::CustomRendererFn\n" << Target.getName() << "InstructionSelector::CustomRenderers[] = {\n" - << " nullptr, // GICP_Invalid\n"; + << " nullptr, // GICR_Invalid\n"; for (const auto &Record : CustomRendererFns) OS << " &" << Target.getName() << "InstructionSelector::" << Record->getValueAsString("RendererFn") diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h index 1776ff7198052..a21b51487722e 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -53,6 +53,11 @@ class GPUDialect : public Dialect { /// 'gpu.kernel' attribute. static bool isKernel(Operation *op); + /// Returns the number of workgroup (thread, block) dimensions supported in + /// the GPU dialect. + // TODO(zinenko,herhut): consider generalizing this. + static unsigned getNumWorkgroupDimensions() { return 3; } + /// Returns the numeric value used to identify the workgroup memory address /// space. static unsigned getWorkgroupAddressSpace() { return 3; } diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index b5b93e9b553b5..766ddbf202c25 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -117,6 +117,10 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> { ]; let extraClassDeclaration = [{ + /// Adds a workgroup attribution of the MemRef type with the given shape and + /// element type. + Value addWorkgroupAttribution(ArrayRef<int64_t> shape, Type elementType); + /// Returns `true` if the GPU function defined by this Op is a kernel, i.e. /// it is intended to be launched from host. bool isKernel() { diff --git a/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h new file mode 100644 index 0000000000000..09c1371708f25 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h @@ -0,0 +1,29 @@ +//===- MemoryPromotion.h - Utilities for moving data across GPU -*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file declares the utility functions that generate IR copying +// the data between different levels of memory hierarchy. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_MEMORYPROMOTION_H +#define MLIR_DIALECT_GPU_MEMORYPROMOTION_H + +namespace mlir { + +namespace gpu { +class GPUFuncOp; +} + +/// Promotes a function argument to workgroup memory in the given function. The +/// copies will be inserted in the beginning and in the end of the function. +void promoteToWorkgroupMemory(gpu::GPUFuncOp op, unsigned arg); + +} // end namespace mlir + +#endif // MLIR_DIALECT_GPU_MEMORYPROMOTION_H diff --git a/mlir/include/mlir/IR/Attributes.h b/mlir/include/mlir/IR/Attributes.h index b8398580f61c9..64b8063bdcb58 100644 --- a/mlir/include/mlir/IR/Attributes.h +++ b/mlir/include/mlir/IR/Attributes.h @@ -215,6 +215,25 @@ class ArrayAttr : public Attribute::AttrBase<ArrayAttr, Attribute, +private: + template <typename AttrTy> + class attr_value_iterator final + : public llvm::mapped_iterator<ArrayAttr::iterator, AttrTy (*)(Attribute)> { + public: + explicit attr_value_iterator(iterator it) + : llvm::mapped_iterator<ArrayAttr::iterator, AttrTy (*)(Attribute)>( + it, [](Attribute attr) { return attr.cast<AttrTy>(); }) {} + AttrTy operator*() { return (*this->I).template cast<AttrTy>(); } + }; + +public: + template <typename AttrTy> + llvm::iterator_range<attr_value_iterator<AttrTy>> getAsRange() { + return llvm::make_range(attr_value_iterator<AttrTy>(begin()), + attr_value_iterator<AttrTy>(end())); + } }; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/IR/Block.h b/mlir/include/mlir/IR/Block.h index c868148f95e2e..2d3eb18d72934 100644 --- a/mlir/include/mlir/IR/Block.h +++ b/mlir/include/mlir/IR/Block.h @@ -79,6 +79,11 @@ class Block : public IRObjectWithUseList, /// Add one value to the argument list. BlockArgument addArgument(Type type); + /// Insert one value at the position in the argument list indicated by the + /// given iterator. The existing arguments are shifted. The block is expected + /// not to have predecessors. + BlockArgument insertArgument(args_iterator it, Type type); + /// Add one argument to the argument list for each type specified in the list. iterator_range<args_iterator> addArguments(ArrayRef<Type> types); diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index b48930c4ddab0..1fbee9742e0d0 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -6,10 +6,11 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" -#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/StandardOps/Ops.h" #include "mlir/Dialect/VectorOps/VectorOps.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -31,6 +32,7 @@ #include "llvm/Support/ErrorHandling.h" using namespace mlir; +using namespace mlir::vector; template <typename T> static LLVM::LLVMType getPtrToElementType(T containerType, @@ -68,6 +70,17 @@ static Value insertOne(ConversionPatternRewriter &rewriter, rewriter.getI64ArrayAttr(pos)); } +// Helper that picks the proper sequence for inserting. 
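+// (For a destination of rank > 1 this emits a vector.insert at the given +// position; at rank 1 it lowers to a vector.insertelement with a constant +// index instead, mirroring the extractOne helper below.)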
+static Value insertOne(PatternRewriter &rewriter, Location loc, Value from, + Value into, int64_t offset) { + auto vectorType = into.getType().cast<VectorType>(); + if (vectorType.getRank() > 1) + return rewriter.create<InsertOp>(loc, from, into, offset); + return rewriter.create<vector::InsertElementOp>( + loc, vectorType, from, into, + rewriter.create<ConstantIndexOp>(loc, offset)); +} + // Helper that picks the proper sequence for extracting. static Value extractOne(ConversionPatternRewriter &rewriter, LLVMTypeConverter &lowering, Location loc, Value val, @@ -84,6 +97,32 @@ static Value extractOne(ConversionPatternRewriter &rewriter, rewriter.getI64ArrayAttr(pos)); } +// Helper that picks the proper sequence for extracting. +static Value extractOne(PatternRewriter &rewriter, Location loc, Value vector, + int64_t offset) { + auto vectorType = vector.getType().cast<VectorType>(); + if (vectorType.getRank() > 1) + return rewriter.create<ExtractOp>(loc, vector, offset); + return rewriter.create<vector::ExtractElementOp>( + loc, vectorType.getElementType(), vector, + rewriter.create<ConstantIndexOp>(loc, offset)); +} + +// Helper that returns a subset of `arrayAttr` as a vector of int64_t. +// TODO(rriddle): Better support for attribute subtype forwarding + slicing. +static SmallVector<int64_t, 4> getI64SubArray(ArrayAttr arrayAttr, + unsigned dropFront = 0, + unsigned dropBack = 0) { + assert(arrayAttr.size() > dropFront + dropBack && "Out of bounds"); + auto range = arrayAttr.getAsRange<IntegerAttr>(); + SmallVector<int64_t, 4> res; + res.reserve(arrayAttr.size() - dropFront - dropBack); + for (auto it = range.begin() + dropFront, eit = range.end() - dropBack; + it != eit; ++it) + res.push_back((*it).getValue().getSExtValue()); + return res; +} + class VectorBroadcastOpConversion : public LLVMOpLowering { public: explicit VectorBroadcastOpConversion(MLIRContext *context, @@ -462,6 +501,139 @@ class VectorInsertOpConversion : public LLVMOpLowering { } }; + +// When ranks are different, InsertStridedSlice needs to extract a properly +// ranked vector from the destination vector into which to insert. This pattern +// only takes care of this part and forwards the rest of the conversion to +// another pattern that converts InsertStridedSlice for operands of the same +// rank. +// +// RewritePattern for InsertStridedSliceOp where source and destination vectors +// have different ranks. In this case: +// 1. the proper subvector is extracted from the destination vector +// 2. a new InsertStridedSlice op is created to insert the source in the +// destination subvector +// 3. the destination subvector is inserted back in the proper place +// 4. the op is replaced by the result of step 3. +// The new InsertStridedSlice from step 2. will be picked up by a +// `VectorInsertStridedSliceOpSameRankRewritePattern`. +class VectorInsertStridedSliceOpDifferentRankRewritePattern + : public OpRewritePattern<InsertStridedSliceOp> { +public: + using OpRewritePattern<InsertStridedSliceOp>::OpRewritePattern; + + PatternMatchResult matchAndRewrite(InsertStridedSliceOp op, + PatternRewriter &rewriter) const override { + auto srcType = op.getSourceVectorType(); + auto dstType = op.getDestVectorType(); + + if (op.offsets().getValue().empty()) + return matchFailure(); + + auto loc = op.getLoc(); + int64_t rankDiff = dstType.getRank() - srcType.getRank(); + assert(rankDiff >= 0); + if (rankDiff == 0) + return matchFailure(); + + int64_t rankRest = dstType.getRank() - rankDiff; + // Extract / insert the subvector of matching rank and InsertStridedSlice + // on it. 
+ Value extracted = + rewriter.create<ExtractOp>(loc, op.dest(), + getI64SubArray(op.offsets(), /*dropFront=*/0, + /*dropBack=*/rankRest)); + // A different pattern will kick in for InsertStridedSlice with matching + // ranks. + auto stridedSliceInnerOp = rewriter.create<InsertStridedSliceOp>( + loc, op.source(), extracted, + getI64SubArray(op.offsets(), /*dropFront=*/rankDiff), + getI64SubArray(op.strides(), /*dropFront=*/rankDiff)); + rewriter.replaceOpWithNewOp<InsertOp>( + op, stridedSliceInnerOp.getResult(), op.dest(), + getI64SubArray(op.offsets(), /*dropFront=*/0, + /*dropBack=*/rankRest)); + return matchSuccess(); + } +}; + +// RewritePattern for InsertStridedSliceOp where source and destination vectors +// have the same rank. For each slice of the source vector along its most major +// dimension: +// 1. the slice is extracted from the source (an element once rank 1 is reached) +// 2. if the slice is still a vector, a rank-reduced InsertStridedSlice op +// inserts it into the matching subvector extracted from the destination; +// this is handled by recursing on this same pattern +// 3. the result is inserted back into the destination vector at the proper +// offset. +// Once all slices along the major dimension are processed, the op is replaced +// by the fully updated destination vector. +class VectorInsertStridedSliceOpSameRankRewritePattern + : public OpRewritePattern<InsertStridedSliceOp> { +public: + using OpRewritePattern<InsertStridedSliceOp>::OpRewritePattern; + + PatternMatchResult matchAndRewrite(InsertStridedSliceOp op, + PatternRewriter &rewriter) const override { + auto srcType = op.getSourceVectorType(); + auto dstType = op.getDestVectorType(); + + if (op.offsets().getValue().empty()) + return matchFailure(); + + int64_t rankDiff = dstType.getRank() - srcType.getRank(); + assert(rankDiff >= 0); + if (rankDiff != 0) + return matchFailure(); + + if (srcType == dstType) { + rewriter.replaceOp(op, op.source()); + return matchSuccess(); + } + + int64_t offset = + op.offsets().getValue().front().cast<IntegerAttr>().getInt(); + int64_t size = srcType.getShape().front(); + int64_t stride = + op.strides().getValue().front().cast<IntegerAttr>().getInt(); + + auto loc = op.getLoc(); + Value res = op.dest(); + // For each slice of the source vector along the most major dimension. + for (int64_t off = offset, e = offset + size * stride, idx = 0; off < e; + off += stride, ++idx) { + // 1. extract the proper subvector (or element) from source + Value extractedSource = extractOne(rewriter, loc, op.source(), idx); + if (extractedSource.getType().isa<VectorType>()) { + // 2. If we have a vector, extract the proper subvector from destination. + // Otherwise we are at the element level and there is no need to recurse. + Value extractedDest = extractOne(rewriter, loc, op.dest(), off); + // 3. Reduce the problem to lowering a new InsertStridedSlice op with + // smaller rank. + InsertStridedSliceOp insertStridedSliceOp = + rewriter.create<InsertStridedSliceOp>( + loc, extractedSource, extractedDest, + getI64SubArray(op.offsets(), /*dropFront=*/1), + getI64SubArray(op.strides(), /*dropFront=*/1)); + // Call matchAndRewrite recursively from within the pattern. This + // circumvents the current limitation that a given pattern cannot + // be called multiple times by the PatternRewrite infrastructure (to + // avoid infinite recursion, but in this case, infinite recursion + // cannot happen because the rank is strictly decreasing). + // TODO(rriddle, nicolasvasilache) Implement something like a hook for + // a potential function that must decrease and allow the same pattern + // multiple times. + auto success = matchAndRewrite(insertStridedSliceOp, rewriter); + (void)success; + assert(success && "Unexpected failure"); + extractedSource = insertStridedSliceOp; + } + // 4. 
Insert the extractedSource into the res vector. + res = insertOne(rewriter, loc, extractedSource, res, off); + } + + rewriter.replaceOp(op, res); + return matchSuccess(); + } +}; + class VectorOuterProductOpConversion : public LLVMOpLowering { public: explicit VectorOuterProductOpConversion(MLIRContext *context, @@ -723,15 +895,71 @@ class VectorPrintOpConversion : public LLVMOpLowering { } }; +/// Progressive lowering of StridedSliceOp to either: +/// 1. extractelement + insertelement for the 1-D case +/// 2. extract + optional strided_slice + insert for the n-D case. +class VectorStridedSliceOpConversion : public OpRewritePattern<StridedSliceOp> { +public: + using OpRewritePattern<StridedSliceOp>::OpRewritePattern; + + PatternMatchResult matchAndRewrite(StridedSliceOp op, + PatternRewriter &rewriter) const override { + auto dstType = op.getResult().getType().cast<VectorType>(); + + assert(!op.offsets().getValue().empty() && "Unexpected empty offsets"); + + int64_t offset = + op.offsets().getValue().front().cast<IntegerAttr>().getInt(); + int64_t size = op.sizes().getValue().front().cast<IntegerAttr>().getInt(); + int64_t stride = + op.strides().getValue().front().cast<IntegerAttr>().getInt(); + + auto loc = op.getLoc(); + auto elemType = dstType.getElementType(); + assert(elemType.isIntOrIndexOrFloat()); + Value zero = rewriter.create<ConstantOp>(loc, elemType, + rewriter.getZeroAttr(elemType)); + Value res = rewriter.create<SplatOp>(loc, dstType, zero); + for (int64_t off = offset, e = offset + size * stride, idx = 0; off < e; + off += stride, ++idx) { + Value extracted = extractOne(rewriter, loc, op.vector(), off); + if (op.offsets().getValue().size() > 1) { + StridedSliceOp stridedSliceOp = rewriter.create<StridedSliceOp>( + loc, extracted, getI64SubArray(op.offsets(), /*dropFront=*/1), + getI64SubArray(op.sizes(), /*dropFront=*/1), + getI64SubArray(op.strides(), /*dropFront=*/1)); + // Call matchAndRewrite recursively from within the pattern. This + // circumvents the current limitation that a given pattern cannot + // be called multiple times by the PatternRewrite infrastructure (to + // avoid infinite recursion, but in this case, infinite recursion + // cannot happen because the rank is strictly decreasing). + // TODO(rriddle, nicolasvasilache) Implement something like a hook for + // a potential function that must decrease and allow the same pattern + // multiple times. + auto success = matchAndRewrite(stridedSliceOp, rewriter); + (void)success; + assert(success && "Unexpected failure"); + extracted = stridedSliceOp; + } + res = insertOne(rewriter, loc, extracted, res, idx); + } + rewriter.replaceOp(op, {res}); + return matchSuccess(); + } +}; + /// Populate the given list with patterns that convert from Vector to LLVM. 
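/// A typical use is inside a conversion pass; a minimal sketch, assuming the usual Std+Vector-to-LLVM setup of this era (names illustrative, not part of this patch): /// OwningRewritePatternList patterns; /// LLVMTypeConverter converter(module.getContext()); /// populateVectorToLLVMConversionPatterns(converter, patterns); /// populateStdToLLVMConversionPatterns(converter, patterns); /// if (failed(applyPartialConversion(module, target, patterns, &converter))) /// signalPassFailure();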
void mlir::populateVectorToLLVMConversionPatterns( LLVMTypeConverter &converter, OwningRewritePatternList &patterns) { + MLIRContext *ctx = converter.getDialect()->getContext(); + patterns.insert<VectorInsertStridedSliceOpDifferentRankRewritePattern, + VectorInsertStridedSliceOpSameRankRewritePattern, + VectorStridedSliceOpConversion>(ctx); patterns.insert<VectorBroadcastOpConversion, VectorShuffleOpConversion, VectorExtractElementOpConversion, VectorExtractOpConversion, VectorInsertElementOpConversion, VectorInsertOpConversion, VectorOuterProductOpConversion, VectorTypeCastOpConversion, - VectorPrintOpConversion>(converter.getDialect()->getContext(), - converter); + VectorPrintOpConversion>(ctx, converter); } namespace { diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index 6fe45ba49ef56..dbf05ac6ace95 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -2,9 +2,25 @@ add_llvm_library(MLIRGPU IR/GPUDialect.cpp IR/DialectRegistration.cpp Transforms/KernelOutlining.cpp + Transforms/MemoryPromotion.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU ) -add_dependencies(MLIRGPU MLIRGPUOpsIncGen MLIRIR MLIRLLVMIR LLVMSupport) -target_link_libraries(MLIRGPU MLIRIR MLIRLLVMIR MLIRStandardOps LLVMSupport) +add_dependencies(MLIRGPU + MLIRGPUOpsIncGen + MLIREDSC + MLIRIR + MLIRLLVMIR + MLIRLoopOps + MLIRSupport + MLIRTransformUtils + LLVMSupport) +target_link_libraries(MLIRGPU + MLIREDSC + MLIRIR + MLIRLLVMIR + MLIRLoopOps + MLIRSupport + MLIRTransformUtils + LLVMSupport) diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index bda8032fc21a6..32d7fae65d9ce 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -593,6 +593,24 @@ LogicalResult verify(LaunchFuncOp op) { // GPUFuncOp //===----------------------------------------------------------------------===// +/// Adds a workgroup attribution to "op" of the MemRef type with the given shape +/// and element type. +Value GPUFuncOp::addWorkgroupAttribution(ArrayRef<int64_t> shape, + Type elementType) { + unsigned pos = getNumFuncArguments() + getNumWorkgroupAttributions(); + Block &bodyBlock = body().front(); + Value attribution = bodyBlock.insertArgument( + std::next(bodyBlock.args_begin(), pos), + MemRefType::get(shape, elementType, /*affineMapComposition=*/{}, + GPUDialect::getWorkgroupAddressSpace())); + auto numWorkgroupBuffersAttr = + getAttrOfType<IntegerAttr>(getNumWorkgroupAttributionsAttrName()); + setAttr(getNumWorkgroupAttributionsAttrName(), + IntegerAttr::get(numWorkgroupBuffersAttr.getType(), + numWorkgroupBuffersAttr.getValue() + 1)); + return attribution; +} + void GPUFuncOp::build(Builder *builder, OperationState &result, StringRef name, FunctionType type, ArrayRef<Type> workgroupAttributions, ArrayRef<Type> privateAttributions, diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp new file mode 100644 index 0000000000000..f01a430a216da --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp @@ -0,0 +1,173 @@ +//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utilities that allow one to create IR moving the data +// across different levels of the GPU memory hierarchy. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/MemoryPromotion.h" +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/EDSC/Builders.h" +#include "mlir/EDSC/Helpers.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/Functional.h" +#include "mlir/Transforms/LoopUtils.h" + +using namespace mlir; +using namespace mlir::gpu; + +/// Returns the textual name of a GPU dimension. +static StringRef getDimName(unsigned dim) { + if (dim == 0) + return "x"; + if (dim == 1) + return "y"; + if (dim == 2) + return "z"; + + llvm_unreachable("dimension ID overflow"); +} + +/// Emits the (imperfect) loop nest performing the copy between "from" and "to" +/// values using the bounds derived from the "from" value. Emits at least +/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with +/// single-iteration loops. Maps the innermost loops to thread dimensions, in +/// reverse order to enable access coalescing in the innermost loop. +static void insertCopyLoops(OpBuilder &builder, Location loc, + edsc::MemRefView &bounds, Value from, Value to) { + // Create EDSC handles for bounds. + unsigned rank = bounds.rank(); + SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps; + + // Make sure we have enough loops to use all thread dimensions; these trivial + // loops should be outermost and therefore inserted first. + if (rank < GPUDialect::getNumWorkgroupDimensions()) { + unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank; + edsc::ValueHandle zero = edsc::intrinsics::constant_index(0); + edsc::ValueHandle one = edsc::intrinsics::constant_index(1); + lbs.resize(extraLoops, zero); + ubs.resize(extraLoops, one); + steps.resize(extraLoops, one); + } + + // Add existing bounds. + lbs.append(bounds.getLbs().begin(), bounds.getLbs().end()); + ubs.append(bounds.getUbs().begin(), bounds.getUbs().end()); + + // Emit constant operations for steps. + steps.reserve(lbs.size()); + llvm::transform( + bounds.getSteps(), std::back_inserter(steps), + [](int64_t step) { return edsc::intrinsics::constant_index(step); }); + + // Obtain thread identifiers and block sizes, necessary to map to them. + auto indexType = builder.getIndexType(); + SmallVector<Value, 3> threadIds, blockDims; + for (unsigned i = 0; i < 3; ++i) { + auto dimName = builder.getStringAttr(getDimName(i)); + threadIds.push_back( + builder.create<gpu::ThreadIdOp>(loc, indexType, dimName)); + blockDims.push_back( + builder.create<gpu::BlockDimOp>(loc, indexType, dimName)); + } + + // Produce the loop nest with copies. + auto ivs = edsc::makeIndexHandles(lbs.size()); + auto ivPtrs = + edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs)); + edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() { + auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank); + edsc::StdIndexedValue fromHandle(from), toHandle(to); + toHandle(activeIvs) = fromHandle(activeIvs); + }); + + // Map the innermost loops to threads in reverse order. + for (auto en : + llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back( + GPUDialect::getNumWorkgroupDimensions())))) { + auto loop = cast<loop::ForOp>( + en.value().getValue().getParentRegion()->getParentOp()); + mapLoopToProcessorIds(loop, {threadIds[en.index()]}, + {blockDims[en.index()]}); + } +} + +/// Emits the loop nests performing the copy to the designated location in the +/// beginning of the region, and from the designated location immediately before +/// the terminator of the first block of the region. The region is expected to +/// have one block. 
This boils down to the following structure +/// +/// ^bb(...): +/// +/// for %arg0 = ... to ... step ... { +/// ... +/// for %argN = ... to ... step ... { +/// %0 = load %from[%arg0, ..., %argN] +/// store %0, %to[%arg0, ..., %argN] +/// } +/// ... +/// } +/// gpu.barrier +/// <... original body ...> +/// gpu.barrier +/// for %arg0 = ... to ... step ... { +/// ... +/// for %argN = ... to ... step ... { +/// %1 = load %to[%arg0, ..., %argN] +/// store %1, %from[%arg0, ..., %argN] +/// } +/// ... +/// } +/// +/// Inserts the barriers unconditionally since different threads may be copying +/// values and reading them. An analysis would be required to eliminate barriers +/// in cases where a value is only used by the thread that copies it. Both copies +/// are inserted unconditionally; an analysis would be required to copy only +/// live-in and live-out values when necessary. This copies the entire memref +/// pointed to by "from". In case a smaller block would be sufficient, the +/// caller can create a subview of the memref and promote it instead. +static void insertCopies(Region &region, Location loc, Value from, Value to) { + auto fromType = from.getType().cast<MemRefType>(); + auto toType = to.getType().cast<MemRefType>(); + (void)fromType; + (void)toType; + assert(fromType.getShape() == toType.getShape()); + assert(fromType.getRank() != 0); + assert(has_single_element(region) && + "unstructured control flow not supported"); + + OpBuilder builder(region.getContext()); + builder.setInsertionPointToStart(&region.front()); + + edsc::ScopedContext edscContext(builder, loc); + edsc::MemRefView fromView(from); + insertCopyLoops(builder, loc, fromView, from, to); + builder.create<gpu::BarrierOp>(loc); + + builder.setInsertionPoint(&region.front().back()); + builder.create<gpu::BarrierOp>(loc); + insertCopyLoops(builder, loc, fromView, to, from); +} + +/// Promotes a function argument to workgroup memory in the given function. The +/// copies will be inserted in the beginning and in the end of the function. +void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) { + Value value = op.getArgument(arg); + auto type = value.getType().dyn_cast<MemRefType>(); + assert(type && type.hasStaticShape() && "can only promote memrefs"); + + Value attribution = + op.addWorkgroupAttribution(type.getShape(), type.getElementType()); + + // Replace the uses first since only the original uses are currently present. + // Then insert the copies. + value.replaceAllUsesWith(attribution); + insertCopies(op.getBody(), op.getLoc(), value, attribution); +} diff --git a/mlir/lib/IR/Block.cpp b/mlir/lib/IR/Block.cpp index b0ada9981a8a7..2757c505555a1 100644 --- a/mlir/lib/IR/Block.cpp +++ b/mlir/lib/IR/Block.cpp @@ -179,6 +179,20 @@ void Block::eraseArgument(unsigned index, bool updatePredTerms) { } } +/// Insert one value at the given position in the argument list. The existing +/// arguments are shifted. The block is expected not to have predecessors. +BlockArgument Block::insertArgument(args_iterator it, Type type) { + assert(llvm::empty(getPredecessors()) && + "cannot insert arguments to blocks with predecessors"); + + // Use the args_iterator (on the BlockArgListType) to compute the insertion + // iterator in the underlying argument storage. 
+ size_t distance = std::distance(args_begin(), it); + auto arg = BlockArgument::create(type, this); + arguments.insert(std::next(arguments.begin(), distance), arg); + return arg; +} + //===----------------------------------------------------------------------===// // Terminator management //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index 1725a0b7c75c3..e01a23343652f 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -423,3 +423,104 @@ func @vector_print_vector(%arg0: vector<2x2xf32>) { // CHECK: llvm.call @print_close() : () -> () // CHECK: llvm.call @print_close() : () -> () // CHECK: llvm.call @print_newline() : () -> () + + +func @strided_slice(%arg0: vector<4xf32>, %arg1: vector<4x8xf32>, %arg2: vector<4x8x16xf32>) { +// CHECK-LABEL: llvm.func @strided_slice( + %0 = vector.strided_slice %arg0 {offsets = [2], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32> +// CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float +// CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2xf32>) : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK: llvm.extractelement %{{.*}}[%{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// CHECK: llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK: llvm.insertelement %{{.*}}, %{{.*}}[%{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(3 : index) : !llvm.i64 +// CHECK: llvm.extractelement %{{.*}}[%{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// CHECK: llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: llvm.insertelement %{{.*}}, %{{.*}}[%{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> + + %1 = vector.strided_slice %arg1 {offsets = [2], sizes = [2], strides = [1]} : vector<4x8xf32> to vector<2x8xf32> +// CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float +// CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2x8xf32>) : !llvm<"[2 x <8 x float>]"> +// CHECK: llvm.extractvalue %{{.*}}[2] : !llvm<"[4 x <8 x float>]"> +// CHECK: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm<"[2 x <8 x float>]"> +// CHECK: llvm.extractvalue %{{.*}}[3] : !llvm<"[4 x <8 x float>]"> +// CHECK: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm<"[2 x <8 x float>]"> + + %2 = vector.strided_slice %arg1 {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x8xf32> to vector<2x2xf32> +// CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float +// CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2x2xf32>) : !llvm<"[2 x <2 x float>]"> +// +// Subvector vector<8xf32> @2 +// CHECK: llvm.extractvalue {{.*}}[2] : !llvm<"[4 x <8 x float>]"> +// CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float +// CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2xf32>) : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<8 x float>"> +// CHECK: llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(3 : index) : !llvm.i64 +// CHECK: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<8 x float>"> +// CHECK: llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK: llvm.insertvalue {{.*}}, {{.*}}[0] : 
!llvm<"[2 x <2 x float>]"> +// +// Subvector vector<8xf32> @3 +// CHECK: llvm.extractvalue {{.*}}[3] : !llvm<"[4 x <8 x float>]"> +// CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float +// CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2xf32>) : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<8 x float>"> +// CHECK: llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(3 : index) : !llvm.i64 +// CHECK: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<8 x float>"> +// CHECK: llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK: llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[2 x <2 x float>]"> + + return +} + +func @insert_strided_slice(%a: vector<2x2xf32>, %b: vector<4x4xf32>, %c: vector<4x4x4xf32>) { +// CHECK-LABEL: @insert_strided_slice + + %0 = vector.insert_strided_slice %b, %c {offsets = [2, 0, 0], strides = [1, 1]} : vector<4x4xf32> into vector<4x4x4xf32> +// CHECK: llvm.extractvalue {{.*}}[2] : !llvm<"[4 x [4 x <4 x float>]]"> +// CHECK-NEXT: llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x [4 x <4 x float>]]"> + + %1 = vector.insert_strided_slice %a, %b {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32> +// +// Subvector vector<2xf32> @0 into vector<4xf32> @2 +// CHECK: llvm.extractvalue {{.*}}[0] : !llvm<"[2 x <2 x float>]"> +// CHECK-NEXT: llvm.extractvalue {{.*}}[2] : !llvm<"[4 x <4 x float>]"> +// Element @0 -> element @2 +// CHECK-NEXT: llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK-NEXT: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK-NEXT: llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK-NEXT: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// Element @1 -> element @3 +// CHECK-NEXT: llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK-NEXT: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK-NEXT: llvm.mlir.constant(3 : index) : !llvm.i64 +// CHECK-NEXT: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// CHECK-NEXT: llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x <4 x float>]"> +// +// Subvector vector<2xf32> @1 into vector<4xf32> @3 +// CHECK: llvm.extractvalue {{.*}}[1] : !llvm<"[2 x <2 x float>]"> +// CHECK-NEXT: llvm.extractvalue {{.*}}[3] : !llvm<"[4 x <4 x float>]"> +// Element @0 -> element @2 +// CHECK-NEXT: llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK-NEXT: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK-NEXT: llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK-NEXT: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// Element @1 -> element @3 +// CHECK-NEXT: llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK-NEXT: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK-NEXT: llvm.mlir.constant(3 : index) : !llvm.i64 +// CHECK-NEXT: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// CHECK-NEXT: llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x <4 x float>]"> + + return +} + diff --git a/mlir/test/Dialect/GPU/promotion.mlir b/mlir/test/Dialect/GPU/promotion.mlir new file mode 100644 index 0000000000000..c06174e0fcded --- /dev/null +++ b/mlir/test/Dialect/GPU/promotion.mlir @@ -0,0 +1,119 @@ 
+// RUN: mlir-opt -test-gpu-memory-promotion -split-input-file %s | FileCheck %s + +module @foo attributes {gpu.kernel_module} { + // Verify that the attribution was indeed introduced. + // CHECK-LABEL: @memref3d + // CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32> + // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>) + gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel { + // Verify that loop bounds are emitted; the order does not matter. + // CHECK-DAG: %[[c1:.*]] = constant 1 + // CHECK-DAG: %[[c4:.*]] = constant 4 + // CHECK-DAG: %[[c5:.*]] = constant 5 + // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"} + // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"} + // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"} + // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"} + // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"} + // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"} + + // Verify that loops for the copy are emitted. We only check the number of + // loops here since their bounds are produced by mapLoopToProcessorIds, + // tested separately. + // CHECK: loop.for %[[i0:.*]] = + // CHECK: loop.for %[[i1:.*]] = + // CHECK: loop.for %[[i2:.*]] = + + // Verify that the copy is emitted and uses only the last two loops. + // CHECK: %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]] + // CHECK: store %[[v]], %[[promoted]][%[[i1]], %[[i2]]] + + // Verify that the use has been rewritten. + // CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>) + "use"(%arg0) : (memref<5x4xf32>) -> () + + + // Verify that loops for the copy are emitted. We only check the number of + // loops here since their bounds are produced by mapLoopToProcessorIds, + // tested separately. + // CHECK: loop.for %[[i0:.*]] = + // CHECK: loop.for %[[i1:.*]] = + // CHECK: loop.for %[[i2:.*]] = + + // Verify that the copy is emitted and uses only the last two loops. + // CHECK: %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]] + // CHECK: store %[[v]], %[[arg]][%[[i1]], %[[i2]]] + gpu.return + } +} + +// ----- + +module @foo attributes {gpu.kernel_module} { + // Verify that the attribution was indeed introduced. + // CHECK-LABEL: @memref5d + // CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32> + // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>) + gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel { + // Verify that loop bounds are emitted; the order does not matter. + // CHECK-DAG: %[[c0:.*]] = constant 0 + // CHECK-DAG: %[[c1:.*]] = constant 1 + // CHECK-DAG: %[[c4:.*]] = constant 4 + // CHECK-DAG: %[[c5:.*]] = constant 5 + // CHECK-DAG: %[[c6:.*]] = constant 6 + // CHECK-DAG: %[[c7:.*]] = constant 7 + // CHECK-DAG: %[[c8:.*]] = constant 8 + // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"} + // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"} + // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"} + // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"} + // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"} + // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"} + + // Verify that loops for the copy are emitted. + // CHECK: loop.for %[[i0:.*]] = + // CHECK: loop.for %[[i1:.*]] = + // CHECK: loop.for %[[i2:.*]] = + // CHECK: loop.for %[[i3:.*]] = + // CHECK: loop.for %[[i4:.*]] = + + // Verify that the copy is emitted. 
+    // CHECK: %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+    // CHECK: store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+
+    // Verify that the use has been rewritten.
+    // CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>)
+    "use"(%arg0) : (memref<8x7x6x5x4xf32>) -> ()
+
+    // Verify that the loops for the copy back are emitted.
+    // CHECK: loop.for %[[i0:.*]] =
+    // CHECK: loop.for %[[i1:.*]] =
+    // CHECK: loop.for %[[i2:.*]] =
+    // CHECK: loop.for %[[i3:.*]] =
+    // CHECK: loop.for %[[i4:.*]] =
+
+    // Verify that the copy back is emitted.
+    // CHECK: %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+    // CHECK: store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+    gpu.return
+  }
+}
+
+// -----
+
+module @foo attributes {gpu.kernel_module} {
+  // Check that a new attribution is inserted alongside the existing ones.
+  // CHECK-LABEL: @insert
+  // CHECK-SAME: (%{{.*}}: memref<4xf32>
+  // CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3>
+  // CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>)
+  // CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>)
+  gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup})
+      workgroup(%arg1: memref<1x1xf64, 3>)
+      private(%arg2: memref<1x1xi64, 5>)
+      kernel {
+    // CHECK: "use"(%[[wg2]])
+    "use"(%arg0) : (memref<4xf32>) -> ()
+    gpu.return
+  }
+}
diff --git a/mlir/test/EDSC/builder-api-test.cpp b/mlir/test/EDSC/builder-api-test.cpp
index c776ffe12bdd7..64e1b7094df3c 100644
--- a/mlir/test/EDSC/builder-api-test.cpp
+++ b/mlir/test/EDSC/builder-api-test.cpp
@@ -724,9 +724,10 @@ TEST_FUNC(indirect_access) {
   // clang-format on
 
   // clang-format off
-  // CHECK-LABEL: func @indirect_access(
-  // CHECK: [[B:%.*]] = affine.load
-  // CHECK: [[D:%.*]] = affine.load
+  // CHECK-LABEL: func @indirect_access
+  // CHECK-SAME: (%[[ARG0:.*]]: memref, %[[ARG1:.*]]: memref, %[[ARG2:.*]]: memref, %[[ARG3:.*]]: memref)
+  // CHECK-DAG: [[B:%.*]] = affine.load %[[ARG1]]
+  // CHECK-DAG: [[D:%.*]] = affine.load %[[ARG3]]
   // CHECK: load %{{.*}}{{\[}}[[B]]{{\]}}
   // CHECK: store %{{.*}}, %{{.*}}{{\[}}[[D]]{{\]}}
   // clang-format on
diff --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir
index a6c2326290752..16ee00923d258 100644
--- a/mlir/test/Transforms/canonicalize.mlir
+++ b/mlir/test/Transforms/canonicalize.mlir
@@ -703,28 +703,28 @@ func @view(%arg0 : index) {
   %c15 = constant 15 : index
 
   // Test: fold constant sizes and offset, update map with static stride/offset.
-  // CHECK: std.view %0[][] : memref<2048xi8> to memref<7x11xf32, #[[VIEW_MAP0]]>
+  // CHECK: std.view %[[ALLOC_MEM]][][] : memref<2048xi8> to memref<7x11xf32, #[[VIEW_MAP0]]>
   %1 = view %0[%c15][%c7, %c11] : memref<2048xi8> to memref
   load %1[%c0, %c0] : memref
 
   // Test: fold constant sizes but not offset, update map with static stride.
   // Test that we do not fold a dynamic dim that is not produced by a constant.
-  // CHECK: std.view %0[%arg0][] : memref<2048xi8> to memref<7x11xf32, #[[VIEW_MAP1]]>
+  // CHECK: std.view %[[ALLOC_MEM]][%arg0][] : memref<2048xi8> to memref<7x11xf32, #[[VIEW_MAP1]]>
   %2 = view %0[%arg0][%c7, %c11] : memref<2048xi8> to memref
   load %2[%c0, %c0] : memref
 
   // Test: fold constant offset but not sizes, update map with constant offset.
   // Test that we fold the constant offset but not the dynamic dims.
-  // CHECK: std.view %0[][%arg0, %arg0] : memref<2048xi8> to memref
+  // CHECK: std.view %[[ALLOC_MEM]][][%arg0, %arg0] : memref<2048xi8> to memref
   %3 = view %0[%c15][%arg0, %arg0] : memref<2048xi8> to memref
   load %3[%c0, %c0] : memref
 
   // Test: fold one constant dim, no offset, should update with constant
   // stride on dim 1, but leave dynamic stride on dim 0.
-  // CHECK: std.view %0[][%arg0, %arg0] : memref<2048xi8> to memref
+  // CHECK: std.view %[[ALLOC_MEM]][][%arg0, %arg0] : memref<2048xi8> to memref
   %4 = view %0[][%arg0, %arg0, %c7] : memref<2048xi8> to memref
   load %4[%c0, %c0, %c0] : memref
 
@@ -736,7 +736,7 @@ func @view(%arg0 : index) {
   load %5[%c0, %c0] : memref
 
   // Test: folding a static alloc and memref_cast into a view.
-  // CHECK: std.view %0[][%c15, %c7] : memref<2048xi8> to memref
+  // CHECK: std.view %[[ALLOC_MEM]][][%c15, %c7] : memref<2048xi8> to memref
   %6 = memref_cast %0 : memref<2048xi8> to memref
   %7 = view %6[%c15][%c7] : memref to memref
   load %7[%c0, %c0] : memref
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index b6338e1d167cd..ac4a4930e5a51 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(MLIRTestTransforms
   TestCallGraph.cpp
   TestConstantFold.cpp
   TestLoopFusion.cpp
+  TestGpuMemoryPromotion.cpp
   TestInlining.cpp
   TestLinalgTransforms.cpp
   TestLiveness.cpp
@@ -26,6 +27,8 @@ add_dependencies(MLIRTestTransforms MLIRTestVectorTransformPatternsIncGen)
 target_link_libraries(MLIRTestTransforms
   MLIRAffineOps
   MLIRAnalysis
+  MLIREDSC
+  MLIRGPU
   MLIRLoopOps
   MLIRPass
   MLIRTestDialect
diff --git a/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
new file mode 100644
index 0000000000000..ee0291827fa45
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
@@ -0,0 +1,40 @@
+//===- TestGpuMemoryPromotion.cpp - Test pass for GPU promotion ----------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a test pass for the utilities that move data across
+// the levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+/// Simple pass for testing the promotion to workgroup memory in GPU functions.
+/// Promotes all arguments with the "gpu.test_promote_workgroup" attribute.
+/// This does not check whether the promotion is legal (e.g., the amount of
+/// memory used) or beneficial (e.g., makes uncoalesced loads coalesced).
+class TestGpuMemoryPromotionPass
+    : public OperationPass<TestGpuMemoryPromotionPass, gpu::GPUFuncOp> {
+  void runOnOperation() override {
+    gpu::GPUFuncOp op = getOperation();
+    for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) {
+      if (op.getArgAttrOfType<UnitAttr>(i, "gpu.test_promote_workgroup"))
+        promoteToWorkgroupMemory(op, i);
+    }
+  }
+};
+} // end namespace
+
+static PassRegistration<TestGpuMemoryPromotionPass> registration(
+    "test-gpu-memory-promotion",
+    "Promotes the annotated arguments of gpu.func to workgroup memory.");
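
Editorial note: for readers unfamiliar with the promotion utility exercised above, the sketch below summarizes the before/after shape that the promotion.mlir tests verify. It is illustrative only, not part of the patch: the kernel name, shapes, and the "use" op are hypothetical, and the copy-loop bounds elided in the comments are produced by mapLoopToProcessorIds.

// Hypothetical input: one kernel argument annotated for promotion.
module @foo attributes {gpu.kernel_module} {
  gpu.func @kernel(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel {
    "use"(%arg0) : (memref<5x4xf32>) -> ()
    gpu.return
  }
}

// After -test-gpu-memory-promotion, schematically: the function gains a
// workgroup attribution in address space 3, the use is redirected to it, and
// thread-distributed copy-in/copy-out loop nests bracket the original body.
module @foo attributes {gpu.kernel_module} {
  gpu.func @kernel(%arg0: memref<5x4xf32>)
      workgroup(%promoted: memref<5x4xf32, 3>) kernel {
    // loop.for nest copying %arg0 into %promoted, with bounds derived from
    // gpu.thread_id/gpu.block_dim via mapLoopToProcessorIds ...
    "use"(%promoted) : (memref<5x4xf32, 3>) -> ()
    // ... symmetric loop.for nest copying %promoted back into %arg0.
    gpu.return
  }
}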