diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp index 5524c4b484be1..67d0931003c54 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp @@ -301,14 +301,20 @@ void UnsafeFunctionsCheck::check(const MatchFinder::MatchResult &Result) { if (Custom) { for (const auto &Entry : CustomFunctions) { if (Entry.Pattern.match(*FuncDecl)) { - const StringRef Reason = + StringRef Reason = Entry.Reason.empty() ? "is marked as unsafe" : Entry.Reason.c_str(); - if (Entry.Replacement.empty()) { + // Omit the replacement when a fully custom reason is given. + if (Reason.consume_front(">")) { + diag(SourceExpr->getExprLoc(), "function %0 %1") + << FuncDecl << Reason.trim() << SourceExpr->getSourceRange(); + // Do not recommend a replacement when none is present. + } else if (Entry.Replacement.empty()) { diag(SourceExpr->getExprLoc(), "function %0 %1; it should not be used") << FuncDecl << Reason << Entry.Replacement << SourceExpr->getSourceRange(); + // Otherwise, emit the replacement. } else { diag(SourceExpr->getExprLoc(), "function %0 %1; '%2' should be used instead")
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index b982216297919..743397e3ec6ce 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -69,6 +69,13 @@ Potentially Breaking Changes - `CharTypdefsToIgnore` to `CharTypedefsToIgnore` in :doc:`bugprone-signed-char-misuse <clang-tidy/checks/bugprone/signed-char-misuse>` + +- Modified the custom message format of :doc:`bugprone-unsafe-functions <clang-tidy/checks/bugprone/unsafe-functions>` by assigning a special meaning to the character ``>`` at the start of the ``reason`` field of an entry in the option ``CustomFunctions``. If the reason starts with ``>``, then the replacement suggestion part of the message (which would be included by default) is omitted. (This does not change the warning locations.) - :program:`clang-tidy` now displays warnings from all non-system headers by default. Previously, users had to explicitly opt-in to header warnings using @@ -387,6 +394,11 @@ Changes in existing checks ` check by adding an additional matcher that generalizes the copy-and-swap idiom pattern detection. + +- Improved :doc:`bugprone-unsafe-functions <clang-tidy/checks/bugprone/unsafe-functions>` check by omitting the default suffix when the reason starts with the character ``>`` in the ``CustomFunctions`` option. - Improved :doc:`cppcoreguidelines-avoid-non-const-global-variables <clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables>` check
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst index f1fec13739271..cb7ea415c54b2 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst @@ -96,37 +96,62 @@ to be checked. The format is the following, without newlines: The functions are matched using POSIX extended regular expressions. *(Note: The regular expressions do not support negative* ``(?!)`` *matches.)* -The `reason` is optional and is used to provide additional information -about the reasoning behind the replacement. The default reason is -`is marked as unsafe`. +The ``reason`` is optional and is used to provide additional information about the +reasoning behind the replacement. The default reason is ``is marked as unsafe``.
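As an illustration of the default reason (the function names here are hypothetical, not part of the check's built-in list), an entry that specifies only a pattern and a replacement produces the default message:

.. code:: c

  // bugprone-unsafe-functions.CustomFunctions:
  //   ^do_query$,do_query_checked
  do_query(); // warning: function 'do_query' is marked as unsafe; 'do_query_checked' should be used instead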
-If `replacement` is empty, the text `it should not be used` will be shown -instead of the suggestion for a replacement. +If ``replacement`` is empty, the default text ``it should not be used`` will be +shown instead of the suggestion for a replacement. -As an example, the configuration `^original$, replacement, is deprecated;` -will produce the following diagnostic message. +If the ``reason`` starts with the character ``>``, the reason becomes fully custom. +The default suffix is disabled even if a ``replacement`` is present, and only the +reason message is shown after the matched function, to allow better control over +the suggestions. (The starting ``>`` and whitespace directly after it are +trimmed from the message.) + +As an example, the following configuration matches only the function ``original`` +in the default namespace. A similar diagnostic can also be printed using a fully +custom reason. .. code:: c + // bugprone-unsafe-functions.CustomFunctions: + // ^original$, replacement, is deprecated; + // Using the fully custom message syntax: + // ^suspicious$,,> should be avoided if possible. original(); // warning: function 'original' is deprecated; 'replacement' should be used instead. + suspicious(); // warning: function 'suspicious' should be avoided if possible. ::std::original(); // no-warning original_function(); // no-warning -If the regular expression contains the character `:`, it is matched against the -qualified name (i.e. ``std::original``), otherwise the regex is matched against the unqualified name (``original``). -If the regular expression starts with `::` (or `^::`), it is matched against the -fully qualified name (``::std::original``). +If the regular expression contains the character ``:``, it is matched against the +qualified name (i.e. ``std::original``), otherwise the regex is matched against +the unqualified name (``original``). If the regular expression starts with ``::`` +(or ``^::``), it is matched against the fully qualified name +(``::std::original``). + +One of the use cases for fully custom messages is suggesting compiler options +and warning flags: + +.. code:: c + + // bugprone-unsafe-functions.CustomFunctions: + // ^memcpy$,,>is recommended to have compiler hardening using '_FORTIFY_SOURCE'; + // ^printf$,,>is recommended to have the '-Werror=format-security' compiler warning flag; + + memcpy(dest, src, 999'999); // warning: function 'memcpy' is recommended to have compiler hardening using '_FORTIFY_SOURCE' + printf(raw_str); // warning: function 'printf' is recommended to have the '-Werror=format-security' compiler warning flag .. note:: - Fully qualified names can contain template parameters on certain C++ classes, but not on C++ functions. - Type aliases are resolved before matching. + Fully qualified names can contain template parameters on certain C++ classes, + but not on C++ functions. Type aliases are resolved before matching. As an example, the member function ``open`` in the class ``std::ifstream`` has a fully qualified name of ``::std::basic_ifstream::open``. - The example could also be matched with the regex ``::std::basic_ifstream<[^>]*>::open``, which matches all potential - template parameters, but does not match nested template classes. + The example could also be matched with the regex + ``::std::basic_ifstream<[^>]*>::open``, which matches all potential template + parameters, but does not match nested template classes. 
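To make the precedence explicit: a fully custom reason (leading ``>``) suppresses the replacement suffix even when a replacement is configured. A short sketch with hypothetical names:

.. code:: c

  // bugprone-unsafe-functions.CustomFunctions:
  //   ^legacy_call$,modern_call,>is scheduled for removal
  legacy_call(); // warning: function 'legacy_call' is scheduled for removal

Here ``modern_call`` is configured as the replacement but is not mentioned in the warning, because the ``>`` prefix disables the default suffix.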
Options ------- diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c index 7fd71ec2f2e7b..7eaf015f06aa2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c @@ -1,5 +1,5 @@ // RUN: %check_clang_tidy -check-suffix=NON-STRICT-REGEX %s bugprone-unsafe-functions %t --\ -// RUN: -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '::name_match,replacement,is a qualname match;^::prefix_match,,is matched on qualname prefix'}}" +// RUN: -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: \"::name_match,,>is a qualname match, but with a fully 'custom' message;^::prefix_match,,is matched on qualname prefix\"}}" // RUN: %check_clang_tidy -check-suffix=STRICT-REGEX %s bugprone-unsafe-functions %t --\ // RUN: -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '^name_match$,replacement,is matched on function name only;^::prefix_match$,,is a full qualname match'}}" @@ -11,14 +11,14 @@ void prefix_match_regex(); void f1() { name_match(); - // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match' is a qualname match; 'replacement' should be used instead + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match' is a qualname match, but with a fully 'custom' message // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'name_match' is matched on function name only; 'replacement' should be used instead prefix_match(); // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'prefix_match' is matched on qualname prefix; it should not be used // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'prefix_match' is a full qualname match; it should not be used name_match_regex(); - // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match_regex' is a qualname match; 'replacement' should be used instead + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match_regex' is a qualname match, but with a fully 'custom' message // no-warning STRICT-REGEX prefix_match_regex(); diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0a79b5ca73472..f2d8eb43b15a7 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -743,6 +743,7 @@ Bug Fixes to Attribute Support - Fixes crashes or missing diagnostics with the `device_kernel` attribute. (#GH161905) - Fix handling of parameter indexes when an attribute is applied to a C++23 explicit object member function. - Fixed several false positives and false negatives in function effect (`nonblocking`) analysis. (#GH166078) (#GH166101) (#GH166110) +- Fix ``cleanup`` attribute by delaying type checks until after the type is deduced. (#GH129631) Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 8dfe4bc08c48e..0097476bc0d8d 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -741,6 +741,17 @@ class Attr { // our existing general parsing we need to have a separate flag that // opts an attribute into strict parsing of attribute parameters bit StrictEnumParameters = 0; + // Set to true for attributes which have Sema checks which requires the type + // to be deduced. 
+ // When `IsTypeDependent` is set to true, you should add an `ActOn*Attr` + // function to `Sema.h`. The signature of the function must be: + // `void ActOn*Attr(Decl *, const Attr *);` where the `Decl *` is the + // declaration the attribute will be attached to; its type will have already + // been deduced, and the `Attr *` is the attribute being applied to that + // declaration. This function should handle all type-sensitive semantics for + // the attribute. This function will be automatically called by + // `Sema::CheckAttributesOnDeducedType()`. + bit IsTypeDependent = 0; // Lists language options, one of which is required to be true for the // attribute to be applicable. If empty, no language options are required. list LangOpts = []; @@ -1400,6 +1411,7 @@ def Cleanup : InheritableAttr { let Args = [DeclArgument]; let Subjects = SubjectList<[LocalVar]>; let Documentation = [CleanupDocs]; + let IsTypeDependent = 1; // FIXME: DeclArgument should be reworked to also store the // Expr instead of adding attr specific hacks like the following. // See the discussion in https://github.com/llvm/llvm-project/pull/14023. diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index dbf857afa08c8..47da17e5cfe83 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5253,6 +5253,18 @@ def HLSLF16ToF32 : LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } +def HLSLDdxCoarse : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_elementwise_ddx_coarse"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HLSLDdyCoarse : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_elementwise_ddy_coarse"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + // Builtins for XRay. def XRayCustomEvent : Builtin { let Spellings = ["__xray_customevent"]; diff --git a/clang/include/clang/Sema/CMakeLists.txt b/clang/include/clang/Sema/CMakeLists.txt index 9077e22c2307c..3f540ea596871 100644 --- a/clang/include/clang/Sema/CMakeLists.txt +++ b/clang/include/clang/Sema/CMakeLists.txt @@ -8,6 +8,11 @@ clang_tablegen(AttrParsedAttrKinds.inc -gen-clang-attr-parsed-attr-kinds SOURCE ../Basic/Attr.td TARGET ClangAttrParsedAttrKinds) +clang_tablegen(AttrIsTypeDependent.inc -gen-clang-attr-is-type-dependent + -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ + SOURCE ../Basic/Attr.td + TARGET ClangAttrIsTypeDependent) + clang_tablegen(AttrSpellingListIndex.inc -gen-clang-attr-spelling-index -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ SOURCE ../Basic/Attr.td diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 6ca182338d6af..fd2a2469142e4 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -4456,6 +4456,10 @@ class Sema final : public SemaBase { NamedDecl *New, Decl *Old, AvailabilityMergeKind AMK = AvailabilityMergeKind::Redeclaration); + /// CheckAttributesOnDeducedType - Calls Sema functions for attributes that + /// requires the type to be deduced. + void CheckAttributesOnDeducedType(Decl *D); + /// MergeTypedefNameDecl - We just parsed a typedef 'New' which has the /// same name and scope as a previous declaration 'Old'. Figure out /// how to resolve this situation, merging decls or emitting @@ -4760,6 +4764,8 @@ class Sema final : public SemaBase { // linkage or not. 
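For reference, the ``AttrIsTypeDependent.inc`` file produced by the new ``-gen-clang-attr-is-type-dependent`` TableGen backend (see ``EmitClangAttrIsTypeDependent`` later in this patch) and included into ``Sema`` just below should expand to roughly the following sketch, with one ``case`` per attribute that sets ``IsTypeDependent`` (currently only ``Cleanup``):

void checkAttrIsTypeDependent(Decl *D, const Attr *A) {
  switch (A->getKind()) {
  default:
    break;
  case attr::Cleanup:
    ActOnCleanupAttr(D, A);
    break;
  }
}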
static bool mightHaveNonExternalLinkage(const DeclaratorDecl *FD); +#include "clang/Sema/AttrIsTypeDependent.inc" + ///@} // @@ -15469,6 +15475,8 @@ class Sema final : public SemaBase { std::optional ActOnEffectExpression(Expr *CondExpr, StringRef AttributeName); + void ActOnCleanupAttr(Decl *D, const Attr *A); + private: /// The implementation of RequireCompleteType bool RequireCompleteTypeImpl(SourceLocation Loc, QualType T, diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index b6928ce7d9c44..12d9a98915ce3 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -924,6 +924,24 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, return EmitRuntimeCall( Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); } + case Builtin::BI__builtin_hlsl_elementwise_ddx_coarse: { + Value *Op0 = EmitScalarExpr(E->getArg(0)); + if (!E->getArg(0)->getType()->hasFloatingRepresentation()) + llvm_unreachable("ddx_coarse operand must have a float representation"); + Intrinsic::ID ID = CGM.getHLSLRuntime().getDdxCoarseIntrinsic(); + return Builder.CreateIntrinsic(/*ReturnType=*/Op0->getType(), ID, + ArrayRef{Op0}, nullptr, + "hlsl.ddx.coarse"); + } + case Builtin::BI__builtin_hlsl_elementwise_ddy_coarse: { + Value *Op0 = EmitScalarExpr(E->getArg(0)); + if (!E->getArg(0)->getType()->hasFloatingRepresentation()) + llvm_unreachable("ddy_coarse operand must have a float representation"); + Intrinsic::ID ID = CGM.getHLSLRuntime().getDdyCoarseIntrinsic(); + return Builder.CreateIntrinsic(/*ReturnType=*/Op0->getType(), ID, + ArrayRef{Op0}, nullptr, + "hlsl.ddy.coarse"); + } case Builtin::BI__builtin_get_spirv_spec_constant_bool: case Builtin::BI__builtin_get_spirv_spec_constant_short: case Builtin::BI__builtin_get_spirv_spec_constant_ushort: diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 48935584f28a2..e1200c62eccf1 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -163,6 +163,8 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(GroupMemoryBarrierWithGroupSync, group_memory_barrier_with_group_sync) GENERATE_HLSL_INTRINSIC_FUNCTION(GetDimensionsX, resource_getdimensions_x) + GENERATE_HLSL_INTRINSIC_FUNCTION(DdxCoarse, ddx_coarse) + GENERATE_HLSL_INTRINSIC_FUNCTION(DdyCoarse, ddy_coarse) //===----------------------------------------------------------------------===// // End of reserved area for HLSL intrinsic getters. diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index e30e56d66c58b..29c9e2125b653 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -72,6 +72,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" @@ -105,6 +106,7 @@ #include #include #include +#include #include #if LLVM_ON_UNIX #include // getpid @@ -2060,12 +2062,17 @@ void Driver::generateCompilationDiagnostics( InputList Inputs; BuildInputs(C.getDefaultToolChain(), C.getArgs(), Inputs); + ArgStringList IRInputs; for (InputList::iterator it = Inputs.begin(), ie = Inputs.end(); it != ie;) { bool IgnoreInput = false; - // Ignore input from stdin or any inputs that cannot be preprocessed. - // Check type first as not all linker inputs have a value. 
- if (types::getPreprocessedType(it->first) == types::TY_INVALID) { + // Save IR inputs separately, ignore input from stdin or any other inputs + // that cannot be preprocessed. Check type first as not all linker inputs + // have a value. + if (types::isLLVMIR(it->first)) { + IRInputs.push_back(it->second->getValue()); + IgnoreInput = true; + } else if (types::getPreprocessedType(it->first) == types::TY_INVALID) { IgnoreInput = true; } else if (!strcmp(it->second->getValue(), "-")) { Diag(clang::diag::note_drv_command_failed_diag_msg) @@ -2082,7 +2089,7 @@ void Driver::generateCompilationDiagnostics( } } - if (Inputs.empty()) { + if (Inputs.empty() && IRInputs.empty()) { Diag(clang::diag::note_drv_command_failed_diag_msg) << "Error generating preprocessed source(s) - " "no preprocessable inputs."; @@ -2105,46 +2112,82 @@ void Driver::generateCompilationDiagnostics( return; } - // Construct the list of abstract actions to perform for this compilation. On - // Darwin OSes this uses the driver-driver and builds universal actions. - const ToolChain &TC = C.getDefaultToolChain(); - if (TC.getTriple().isOSBinFormatMachO()) - BuildUniversalActions(C, TC, Inputs); - else - BuildActions(C, C.getArgs(), Inputs, C.getActions()); + // If we only have IR inputs there's no need for preprocessing. + if (!Inputs.empty()) { + // Construct the list of abstract actions to perform for this compilation. + // On Darwin OSes this uses the driver-driver and builds universal actions. + const ToolChain &TC = C.getDefaultToolChain(); + if (TC.getTriple().isOSBinFormatMachO()) + BuildUniversalActions(C, TC, Inputs); + else + BuildActions(C, C.getArgs(), Inputs, C.getActions()); - BuildJobs(C); + BuildJobs(C); - // If there were errors building the compilation, quit now. - if (Trap.hasErrorOccurred()) { - Diag(clang::diag::note_drv_command_failed_diag_msg) - << "Error generating preprocessed source(s)."; - return; - } + // If there were errors building the compilation, quit now. + if (Trap.hasErrorOccurred()) { + Diag(clang::diag::note_drv_command_failed_diag_msg) + << "Error generating preprocessed source(s)."; + return; + } + // Generate preprocessed output. + SmallVector, 4> FailingCommands; + C.ExecuteJobs(C.getJobs(), FailingCommands); - // Generate preprocessed output. - SmallVector, 4> FailingCommands; - C.ExecuteJobs(C.getJobs(), FailingCommands); + // If any of the preprocessing commands failed, clean up and exit. + if (!FailingCommands.empty()) { + Diag(clang::diag::note_drv_command_failed_diag_msg) + << "Error generating preprocessed source(s)."; + return; + } - // If any of the preprocessing commands failed, clean up and exit. - if (!FailingCommands.empty()) { - Diag(clang::diag::note_drv_command_failed_diag_msg) - << "Error generating preprocessed source(s)."; - return; + const ArgStringList &TempFiles = C.getTempFiles(); + if (TempFiles.empty()) { + Diag(clang::diag::note_drv_command_failed_diag_msg) + << "Error generating preprocessed source(s)."; + return; + } } - const ArgStringList &TempFiles = C.getTempFiles(); - if (TempFiles.empty()) { - Diag(clang::diag::note_drv_command_failed_diag_msg) - << "Error generating preprocessed source(s)."; - return; + // Copying filenames due to ownership. + const ArgStringList &Files = C.getTempFiles(); + SmallVector TempFiles(Files.begin(), Files.end()); + + // We'd like to copy the IR input file into our own temp file + // because the build system might try to clean-up after itself. 
+ for (auto const *Input : IRInputs) { + int FD; + llvm::SmallVector Path; + + StringRef extension = llvm::sys::path::extension(Input); + if (!extension.empty()) + extension = extension.drop_front(); + + std::error_code EC = llvm::sys::fs::createTemporaryFile( + llvm::sys::path::stem(Input), extension, FD, Path); + if (EC) { + Diag(clang::diag::note_drv_command_failed_diag_msg) + << "Error generating run script: " << "Failed copying IR input files" + << " " << EC.message(); + return; + } + + EC = llvm::sys::fs::copy_file(Input, FD); + if (EC) { + Diag(clang::diag::note_drv_command_failed_diag_msg) + << "Error generating run script: " << "Failed copying IR input files" + << " " << EC.message(); + return; + } + + TempFiles.push_back(std::string(Path.begin(), Path.end())); } Diag(clang::diag::note_drv_command_failed_diag_msg) << BugReporMsg; SmallString<128> VFS; SmallString<128> ReproCrashFilename; - for (const char *TempFile : TempFiles) { + for (std::string &TempFile : TempFiles) { Diag(clang::diag::note_drv_command_failed_diag_msg) << TempFile; if (Report) Report->TemporaryFiles.push_back(TempFile); @@ -2161,7 +2204,7 @@ void Driver::generateCompilationDiagnostics( } for (const char *TempFile : SavedTemps) - C.addTempFile(TempFile); + TempFiles.push_back(TempFile); // Assume associated files are based off of the first temporary file. CrashReportInfo CrashInfo(TempFiles[0], VFS); diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 5ee53c01d6e30..9871cf4449ec6 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1646,6 +1646,8 @@ SanitizerMask ToolChain::getSupportedSanitizers() const { Res |= SanitizerKind::ShadowCallStack; if (getTriple().isAArch64(64)) Res |= SanitizerKind::MemTag; + if (getTriple().isBPF()) + Res |= SanitizerKind::KernelAddress; return Res; } diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h index 2e2703de18cb1..38b95ee90736a 100644 --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h @@ -2946,5 +2946,73 @@ float4 radians(float4); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_group_memory_barrier_with_group_sync) __attribute__((convergent)) void GroupMemoryBarrierWithGroupSync(void); +//===----------------------------------------------------------------------===// +// ddx_coarse builtin +//===----------------------------------------------------------------------===// + +/// \fn T ddx_coarse(T value) +/// \brief Computes a low precision partial derivative with respect to the +/// screen-space x-coordinate. +/// \param value The input value. +/// +/// The return value is a floating point scalar or vector containing the low +/// prevision partial derivative of the input value. 
+ +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +half ddx_coarse(half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +half2 ddx_coarse(half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +half3 ddx_coarse(half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +half4 ddx_coarse(half4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +float ddx_coarse(float); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +float2 ddx_coarse(float2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +float3 ddx_coarse(float3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +float4 ddx_coarse(float4); + +//===----------------------------------------------------------------------===// +// ddy_coarse builtin +//===----------------------------------------------------------------------===// + +/// \fn T ddy_coarse(T value) +/// \brief Computes a low precision partial derivative with respect to the +/// screen-space y-coordinate. +/// \param value The input value. +/// +/// The return value is a floating point scalar or vector containing the low +/// prevision partial derivative of the input value. + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +half ddy_coarse(half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +half2 ddy_coarse(half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +half3 ddy_coarse(half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +half4 ddy_coarse(half4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +float ddy_coarse(float); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +float2 ddy_coarse(float2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +float3 ddy_coarse(float3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +float4 ddy_coarse(float4); + } // namespace hlsl #endif //_HLSL_HLSL_ALIAS_INTRINSICS_H_ diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 0ab408cd060c8..4b4ce13806873 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3355,6 +3355,11 @@ void Sema::mergeDeclAttributes(NamedDecl *New, Decl *Old, if (!foundAny) New->dropAttrs(); } +void Sema::CheckAttributesOnDeducedType(Decl *D) { + for (const Attr *A : D->attrs()) + checkAttrIsTypeDependent(D, A); +} + // Returns the number of added attributes. template static unsigned propagateAttribute(ParmVarDecl *To, const ParmVarDecl *From, @@ -13829,6 +13834,8 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { return; } + this->CheckAttributesOnDeducedType(RealDecl); + // dllimport cannot be used on variable definitions. 
if (VDecl->hasAttr() && !VDecl->isStaticDataMember()) { Diag(VDecl->getLocation(), diag::err_attribute_dllimport_data_definition); @@ -14327,6 +14334,8 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { DeduceVariableDeclarationType(Var, false, nullptr)) return; + this->CheckAttributesOnDeducedType(RealDecl); + // C++11 [class.static.data]p3: A static data member can be declared with // the constexpr specifier; if so, its declaration shall specify // a brace-or-equal-initializer. diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index a9e7b44ac9d73..bda7aa32a9348 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3511,16 +3511,6 @@ static void handleCleanupAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - // We're currently more strict than GCC about what function types we accept. - // If this ever proves to be a problem it should be easy to fix. - QualType Ty = S.Context.getPointerType(cast(D)->getType()); - QualType ParamTy = FD->getParamDecl(0)->getType(); - if (!S.IsAssignConvertCompatible(S.CheckAssignmentConstraints( - FD->getParamDecl(0)->getLocation(), ParamTy, Ty))) { - S.Diag(Loc, diag::err_attribute_cleanup_func_arg_incompatible_type) - << NI.getName() << ParamTy << Ty; - return; - } VarDecl *VD = cast(D); // Create a reference to the variable declaration. This is a fake/dummy // reference. @@ -8311,3 +8301,28 @@ void Sema::redelayDiagnostics(DelayedDiagnosticPool &pool) { assert(curPool && "re-emitting in undelayed context not supported"); curPool->steal(pool); } + +void Sema::ActOnCleanupAttr(Decl *D, const Attr *A) { + VarDecl *VD = cast(D); + if (VD->getType()->isDependentType()) + return; + + // Obtains the FunctionDecl that was found when handling the attribute + // earlier. + CleanupAttr *Attr = D->getAttr(); + FunctionDecl *FD = Attr->getFunctionDecl(); + DeclarationNameInfo NI = FD->getNameInfo(); + + // We're currently more strict than GCC about what function types we accept. + // If this ever proves to be a problem it should be easy to fix. 
+ QualType Ty = this->Context.getPointerType(VD->getType()); + QualType ParamTy = FD->getParamDecl(0)->getType(); + if (!this->IsAssignConvertCompatible(this->CheckAssignmentConstraints( + FD->getParamDecl(0)->getLocation(), ParamTy, Ty))) { + this->Diag(Attr->getArgLoc(), + diag::err_attribute_cleanup_func_arg_incompatible_type) + << NI.getName() << ParamTy << Ty; + D->dropAttr(); + return; + } +} diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 2b9b3abbd5360..5555916c2536f 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -3239,7 +3239,9 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { case Builtin::BI__builtin_hlsl_elementwise_degrees: case Builtin::BI__builtin_hlsl_elementwise_radians: case Builtin::BI__builtin_hlsl_elementwise_rsqrt: - case Builtin::BI__builtin_hlsl_elementwise_frac: { + case Builtin::BI__builtin_hlsl_elementwise_frac: + case Builtin::BI__builtin_hlsl_elementwise_ddx_coarse: + case Builtin::BI__builtin_hlsl_elementwise_ddy_coarse: { if (SemaRef.checkArgCount(TheCall, 1)) return true; if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall, diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 1b6b559c1227b..3a4b2ccc74350 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -1007,6 +1007,15 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs, continue; } + if (auto *A = dyn_cast(TmplAttr)) { + if (!New->hasAttr()) { + auto *NewAttr = A->clone(Context); + NewAttr->setArgLoc(A->getArgLoc()); + New->addAttr(NewAttr); + } + continue; + } + assert(!TmplAttr->isPackExpansion()); if (TmplAttr->isLateParsed() && LateAttrs) { // Late parsed attributes must be instantiated and attached after the diff --git a/clang/test/CIR/CodeGen/call.c b/clang/test/CIR/CodeGen/call.c index d780e37f3d153..99ae4506b1f16 100644 --- a/clang/test/CIR/CodeGen/call.c +++ b/clang/test/CIR/CodeGen/call.c @@ -130,7 +130,7 @@ int f12(void) { // OGCG: %{{.+}} = call i32 @f10(i32 noundef 1) #[[ATTR0:.+]] // OGCG-NEXT: %{{.+}} = call i32 @f11(i32 noundef 2) #[[ATTR1:.+]] -// LLVM: attributes #[[ATTR0]] = { nounwind willreturn memory(read, errnomem: none) } +// LLVM: attributes #[[ATTR0]] = { nounwind willreturn memory(read, errnomem: none, target_mem0: none, target_mem1: none) } // LLVM: attributes #[[ATTR1]] = { nounwind willreturn memory(none) } // OGCG: attributes #[[ATTR0]] = { nounwind willreturn memory(read) } diff --git a/clang/test/CodeGenHLSL/builtins/ddx-coarse-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/ddx-coarse-builtin.hlsl new file mode 100644 index 0000000000000..01216eefadba2 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/ddx-coarse-builtin.hlsl @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-pc-vulkan-compute %s \ +// RUN: -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK-SPIRV + +// CHECK-LABEL: half @_Z19test_f16_ddx_coarseDh +// CHECK: %hlsl.ddx.coarse = call {{.*}} half @llvm.dx.ddx.coarse.f16(half %{{.*}}) +// CHECK: ret half %hlsl.ddx.coarse +// CHECK-LABEL-SPIRV: half @_Z19test_f16_ddx_coarseDh +// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} 
half @llvm.spv.ddx.coarse.f16(half %{{.*}}) +// CHECK-SPIRV: ret half %hlsl.ddx.coarse +half test_f16_ddx_coarse(half val) { + return __builtin_hlsl_elementwise_ddx_coarse(val); +} + +// CHECK-LABEL: float @_Z19test_f32_ddx_coarsef +// CHECK: %hlsl.ddx.coarse = call {{.*}} float @llvm.dx.ddx.coarse.f32(float %{{.*}}) +// CHECK: ret float %hlsl.ddx.coarse +// CHECK-LABEL-SPIRV: float @_Z19test_f32_ddx_coarsef +// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} float @llvm.spv.ddx.coarse.f32(float %{{.*}}) +// CHECK-SPIRV: ret float %hlsl.ddx.coarse +float test_f32_ddx_coarse(float val) { + return __builtin_hlsl_elementwise_ddx_coarse(val); +} diff --git a/clang/test/CodeGenHLSL/builtins/ddx-coarse.hlsl b/clang/test/CodeGenHLSL/builtins/ddx-coarse.hlsl new file mode 100644 index 0000000000000..c200d4715629e --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/ddx-coarse.hlsl @@ -0,0 +1,86 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-pc-vulkan-compute %s \ +// RUN: -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK-SPIRV + +// CHECK-LABEL: half @_Z19test_f16_ddx_coarseDh +// CHECK: %hlsl.ddx.coarse = call {{.*}} half @llvm.dx.ddx.coarse.f16(half %{{.*}}) +// CHECK: ret half %hlsl.ddx.coarse +// CHECK-LABEL-SPIRV: half @_Z19test_f16_ddx_coarseDh +// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} half @llvm.spv.ddx.coarse.f16(half %{{.*}}) +// CHECK-SPIRV: ret half %hlsl.ddx.coarse +half test_f16_ddx_coarse(half val) { + return ddx_coarse(val); +} + +// CHECK-LABEL: <2 x half> @_Z20test_f16_ddx_coarse2Dv2_Dh +// CHECK: %hlsl.ddx.coarse = call {{.*}} <2 x half> @llvm.dx.ddx.coarse.v2f16(<2 x half> %{{.*}}) +// CHECK: ret <2 x half> %hlsl.ddx.coarse +// CHECK-LABEL-SPIRV: <2 x half> @_Z20test_f16_ddx_coarse2Dv2_Dh +// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <2 x half> @llvm.spv.ddx.coarse.v2f16(<2 x half> %{{.*}}) +// CHECK-SPIRV: ret <2 x half> %hlsl.ddx.coarse +half2 test_f16_ddx_coarse2(half2 val) { + return ddx_coarse(val); +} + +// CHECK-LABEL: <3 x half> @_Z20test_f16_ddx_coarse3Dv3_Dh +// CHECK: %hlsl.ddx.coarse = call {{.*}} <3 x half> @llvm.dx.ddx.coarse.v3f16(<3 x half> %{{.*}}) +// CHECK: ret <3 x half> %hlsl.ddx.coarse +// CHECK-LABEL-SPIRV: <3 x half> @_Z20test_f16_ddx_coarse3Dv3_Dh +// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <3 x half> @llvm.spv.ddx.coarse.v3f16(<3 x half> %{{.*}}) +// CHECK-SPIRV: ret <3 x half> %hlsl.ddx.coarse +half3 test_f16_ddx_coarse3(half3 val) { + return ddx_coarse(val); +} + +// CHECK-LABEL: <4 x half> @_Z20test_f16_ddx_coarse4Dv4_Dh +// CHECK: %hlsl.ddx.coarse = call {{.*}} <4 x half> @llvm.dx.ddx.coarse.v4f16(<4 x half> %{{.*}}) +// CHECK: ret <4 x half> %hlsl.ddx.coarse +// CHECK-LABEL-SPIRV: <4 x half> @_Z20test_f16_ddx_coarse4Dv4_Dh +// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <4 x half> @llvm.spv.ddx.coarse.v4f16(<4 x half> %{{.*}}) +// CHECK-SPIRV: ret <4 x half> %hlsl.ddx.coarse +half4 test_f16_ddx_coarse4(half4 val) { + return ddx_coarse(val); +} + +// CHECK-LABEL: float @_Z19test_f32_ddx_coarsef +// CHECK: %hlsl.ddx.coarse = call {{.*}} float @llvm.dx.ddx.coarse.f32(float %{{.*}}) +// CHECK: ret float %hlsl.ddx.coarse +// CHECK-LABEL-SPIRV: float @_Z19test_f32_ddx_coarsef +// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} float @llvm.spv.ddx.coarse.f32(float 
%{{.*}}) +// CHECK-SPIRV: ret float %hlsl.ddx.coarse +float test_f32_ddx_coarse(float val) { + return ddx_coarse(val); +} + +// CHECK-LABEL: <2 x float> @_Z20test_f32_ddx_coarse2Dv2_f +// CHECK: %hlsl.ddx.coarse = call {{.*}} <2 x float> @llvm.dx.ddx.coarse.v2f32(<2 x float> %{{.*}}) +// CHECK: ret <2 x float> %hlsl.ddx.coarse +// CHECK-LABEL-SPIRV: <2 x float> @_Z20test_f32_ddx_coarse2Dv2_f +// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <2 x float> @llvm.spv.ddx.coarse.v2f32(<2 x float> %{{.*}}) +// CHECK-SPIRV: ret <2 x float> %hlsl.ddx.coarse +float2 test_f32_ddx_coarse2(float2 val) { + return ddx_coarse(val); +} + +// CHECK-LABEL: <3 x float> @_Z20test_f32_ddx_coarse3Dv3_f +// CHECK: %hlsl.ddx.coarse = call {{.*}} <3 x float> @llvm.dx.ddx.coarse.v3f32(<3 x float> %{{.*}}) +// CHECK: ret <3 x float> %hlsl.ddx.coarse +// CHECK-LABEL-SPIRV: <3 x float> @_Z20test_f32_ddx_coarse3Dv3_f +// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <3 x float> @llvm.spv.ddx.coarse.v3f32(<3 x float> %{{.*}}) +// CHECK-SPIRV: ret <3 x float> %hlsl.ddx.coarse +float3 test_f32_ddx_coarse3(float3 val) { + return ddx_coarse(val); +} + +// CHECK-LABEL: <4 x float> @_Z20test_f32_ddx_coarse4Dv4_f +// CHECK: %hlsl.ddx.coarse = call {{.*}} <4 x float> @llvm.dx.ddx.coarse.v4f32(<4 x float> %{{.*}}) +// CHECK: ret <4 x float> %hlsl.ddx.coarse +// CHECK-LABEL-SPIRV: <4 x float> @_Z20test_f32_ddx_coarse4Dv4_f +// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <4 x float> @llvm.spv.ddx.coarse.v4f32(<4 x float> %{{.*}}) +// CHECK-SPIRV: ret <4 x float> %hlsl.ddx.coarse +float4 test_f32_ddx_coarse4(float4 val) { + return ddx_coarse(val); +} diff --git a/clang/test/CodeGenHLSL/builtins/ddy-coarse-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/ddy-coarse-builtin.hlsl new file mode 100644 index 0000000000000..2967deb75031f --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/ddy-coarse-builtin.hlsl @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-pc-vulkan-compute %s \ +// RUN: -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK-SPIRV + +// CHECK-LABEL: half @_Z19test_f16_ddy_coarseDh +// CHECK: %hlsl.ddy.coarse = call {{.*}} half @llvm.dx.ddy.coarse.f16(half %{{.*}}) +// CHECK: ret half %hlsl.ddy.coarse +// CHECK-LABEL-SPIRV: half @_Z19test_f16_ddy_coarseDh +// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} half @llvm.spv.ddy.coarse.f16(half %{{.*}}) +// CHECK-SPIRV: ret half %hlsl.ddy.coarse +half test_f16_ddy_coarse(half val) { + return __builtin_hlsl_elementwise_ddy_coarse(val); +} + +// CHECK-LABEL: float @_Z19test_f32_ddy_coarsef +// CHECK: %hlsl.ddy.coarse = call {{.*}} float @llvm.dx.ddy.coarse.f32(float %{{.*}}) +// CHECK: ret float %hlsl.ddy.coarse +// CHECK-LABEL-SPIRV: float @_Z19test_f32_ddy_coarsef +// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} float @llvm.spv.ddy.coarse.f32(float %{{.*}}) +// CHECK-SPIRV: ret float %hlsl.ddy.coarse +float test_f32_ddy_coarse(float val) { + return __builtin_hlsl_elementwise_ddy_coarse(val); +} diff --git a/clang/test/CodeGenHLSL/builtins/ddy-coarse.hlsl b/clang/test/CodeGenHLSL/builtins/ddy-coarse.hlsl new file mode 100644 index 0000000000000..faa972a1be326 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/ddy-coarse.hlsl @@ -0,0 +1,86 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl 
-triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-pc-vulkan-compute %s \ +// RUN: -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK-SPIRV + +// CHECK-LABEL: half @_Z19test_f16_ddy_coarseDh +// CHECK: %hlsl.ddy.coarse = call {{.*}} half @llvm.dx.ddy.coarse.f16(half %{{.*}}) +// CHECK: ret half %hlsl.ddy.coarse +// CHECK-LABEL-SPIRV: half @_Z19test_f16_ddy_coarseDh +// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} half @llvm.spv.ddy.coarse.f16(half %{{.*}}) +// CHECK-SPIRV: ret half %hlsl.ddy.coarse +half test_f16_ddy_coarse(half val) { + return ddy_coarse(val); +} + +// CHECK-LABEL: <2 x half> @_Z20test_f16_ddy_coarse2Dv2_Dh +// CHECK: %hlsl.ddy.coarse = call {{.*}} <2 x half> @llvm.dx.ddy.coarse.v2f16(<2 x half> %{{.*}}) +// CHECK: ret <2 x half> %hlsl.ddy.coarse +// CHECK-LABEL-SPIRV: <2 x half> @_Z20test_f16_ddy_coarse2Dv2_Dh +// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <2 x half> @llvm.spv.ddy.coarse.v2f16(<2 x half> %{{.*}}) +// CHECK-SPIRV: ret <2 x half> %hlsl.ddy.coarse +half2 test_f16_ddy_coarse2(half2 val) { + return ddy_coarse(val); +} + +// CHECK-LABEL: <3 x half> @_Z20test_f16_ddy_coarse3Dv3_Dh +// CHECK: %hlsl.ddy.coarse = call {{.*}} <3 x half> @llvm.dx.ddy.coarse.v3f16(<3 x half> %{{.*}}) +// CHECK: ret <3 x half> %hlsl.ddy.coarse +// CHECK-LABEL-SPIRV: <3 x half> @_Z20test_f16_ddy_coarse3Dv3_Dh +// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <3 x half> @llvm.spv.ddy.coarse.v3f16(<3 x half> %{{.*}}) +// CHECK-SPIRV: ret <3 x half> %hlsl.ddy.coarse +half3 test_f16_ddy_coarse3(half3 val) { + return ddy_coarse(val); +} + +// CHECK-LABEL: <4 x half> @_Z20test_f16_ddy_coarse4Dv4_Dh +// CHECK: %hlsl.ddy.coarse = call {{.*}} <4 x half> @llvm.dx.ddy.coarse.v4f16(<4 x half> %{{.*}}) +// CHECK: ret <4 x half> %hlsl.ddy.coarse +// CHECK-LABEL-SPIRV: <4 x half> @_Z20test_f16_ddy_coarse4Dv4_Dh +// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <4 x half> @llvm.spv.ddy.coarse.v4f16(<4 x half> %{{.*}}) +// CHECK-SPIRV: ret <4 x half> %hlsl.ddy.coarse +half4 test_f16_ddy_coarse4(half4 val) { + return ddy_coarse(val); +} + +// CHECK-LABEL: float @_Z19test_f32_ddy_coarsef +// CHECK: %hlsl.ddy.coarse = call {{.*}} float @llvm.dx.ddy.coarse.f32(float %{{.*}}) +// CHECK: ret float %hlsl.ddy.coarse +// CHECK-LABEL-SPIRV: float @_Z19test_f32_ddy_coarsef +// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} float @llvm.spv.ddy.coarse.f32(float %{{.*}}) +// CHECK-SPIRV: ret float %hlsl.ddy.coarse +float test_f32_ddy_coarse(float val) { + return ddy_coarse(val); +} + +// CHECK-LABEL: <2 x float> @_Z20test_f32_ddy_coarse2Dv2_f +// CHECK: %hlsl.ddy.coarse = call {{.*}} <2 x float> @llvm.dx.ddy.coarse.v2f32(<2 x float> %{{.*}}) +// CHECK: ret <2 x float> %hlsl.ddy.coarse +// CHECK-LABEL-SPIRV: <2 x float> @_Z20test_f32_ddy_coarse2Dv2_f +// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <2 x float> @llvm.spv.ddy.coarse.v2f32(<2 x float> %{{.*}}) +// CHECK-SPIRV: ret <2 x float> %hlsl.ddy.coarse +float2 test_f32_ddy_coarse2(float2 val) { + return ddy_coarse(val); +} + +// CHECK-LABEL: <3 x float> @_Z20test_f32_ddy_coarse3Dv3_f +// CHECK: %hlsl.ddy.coarse = call {{.*}} <3 x float> @llvm.dx.ddy.coarse.v3f32(<3 x float> %{{.*}}) +// CHECK: ret <3 x float> %hlsl.ddy.coarse +// CHECK-LABEL-SPIRV: <3 x float> @_Z20test_f32_ddy_coarse3Dv3_f +// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <3 x 
float> @llvm.spv.ddy.coarse.v3f32(<3 x float> %{{.*}}) +// CHECK-SPIRV: ret <3 x float> %hlsl.ddy.coarse +float3 test_f32_ddy_coarse3(float3 val) { + return ddy_coarse(val); +} + +// CHECK-LABEL: <4 x float> @_Z20test_f32_ddy_coarse4Dv4_f +// CHECK: %hlsl.ddy.coarse = call {{.*}} <4 x float> @llvm.dx.ddy.coarse.v4f32(<4 x float> %{{.*}}) +// CHECK: ret <4 x float> %hlsl.ddy.coarse +// CHECK-LABEL-SPIRV: <4 x float> @_Z20test_f32_ddy_coarse4Dv4_f +// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <4 x float> @llvm.spv.ddy.coarse.v4f32(<4 x float> %{{.*}}) +// CHECK-SPIRV: ret <4 x float> %hlsl.ddy.coarse +float4 test_f32_ddy_coarse4(float4 val) { + return ddy_coarse(val); +} diff --git a/clang/test/Driver/crash-ir-repro.cpp b/clang/test/Driver/crash-ir-repro.cpp new file mode 100644 index 0000000000000..1f31a5ca1bb34 --- /dev/null +++ b/clang/test/Driver/crash-ir-repro.cpp @@ -0,0 +1,15 @@ +// RUN: %clang -S -emit-llvm -o %t.ll %s +// RUN: not %clang -S -DCRASH %s %t.ll 2>&1 | FileCheck %s + +// CHECK: Preprocessed source(s) and associated run script(s) are located at: +// CHECK-NEXT: clang: note: diagnostic msg: {{.*}}.cpp +// CHECK-NEXT: clang: note: diagnostic msg: {{.*}}.ll +// CHECK-NEXT: clang: note: diagnostic msg: {{.*}}.sh + +#ifdef CRASH +#pragma clang __debug parser_crash +#endif + +int main() { + return 0; +} diff --git a/clang/test/Driver/linker-wrapper-hip-amdgcnspirv.c b/clang/test/Driver/linker-wrapper-hip-amdgcnspirv.c new file mode 100644 index 0000000000000..429f7d3b9ee13 --- /dev/null +++ b/clang/test/Driver/linker-wrapper-hip-amdgcnspirv.c @@ -0,0 +1,16 @@ +// RUN: %clang -cc1 %s -triple "spirv64-amd-amdhsa" -emit-llvm-bc -o %t.bc +// RUN: llvm-offload-binary -o %t.out "--image=file=%t.bc,triple=spirv64-amd-amdhsa,arch=amdgcnspirv,kind=hip" +// RUN: clang-linker-wrapper \ +// RUN: "--should-extract=amdgcnspirv" \ +// RUN: "--host-triple=spirv64-amd-amdhsa" \ +// RUN: "--linker-path=clang-offload-bundler" \ +// RUN: "--emit-fatbin-only" \ +// RUN: "-o" "%t.hipfb" \ +// RUN: "%t.out" \ +// RUN: --dry-run \ +// RUN: 2>&1 | FileCheck %s + +// clang-linker-wrapper was previously calling clang-offload-bundler with -targets=...,hip-amdgcn-amd-amdhsa--amdgcnspirv +// This caused the runtime not to recognise the triple for the AMD SPIR-V code. 
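// In other words, for the 'amdgcnspirv' arch the bundler target entry emitted by
// clang-linker-wrapper changes (see the ClangLinkerWrapper.cpp hunk later in this patch):
//   before: hip-amdgcn-amd-amdhsa--amdgcnspirv
//   after:  hip-spirv64-amd-amdhsa--amdgcnspirv
// Other arches keep the hip-amdgcn-amd-amdhsa-- prefix.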
+ +// CHECK: {{".*clang-offload-bundler.*"}} {{.*}} -targets={{.*}},hip-spirv64-amd-amdhsa--amdgcnspirv diff --git a/clang/test/Sema/type-dependent-attrs.c b/clang/test/Sema/type-dependent-attrs.c new file mode 100644 index 0000000000000..13068b3f94ad4 --- /dev/null +++ b/clang/test/Sema/type-dependent-attrs.c @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -std=c23 -fsyntax-only -verify %s + +int open() { return 0; } +void close(typeof(open()) *) {} + +void cleanup_attr() { + int fd_int [[gnu::cleanup(close)]] = open(); + auto fd_auto [[gnu::cleanup(close)]] = open(); + float fd_invalid [[gnu::cleanup(close)]] = open(); // expected-error {{'cleanup' function 'close' parameter has type 'typeof (open()) *' (aka 'int *') which is incompatible with type 'float *'}} +} diff --git a/clang/test/SemaCXX/attr-cleanup.cpp b/clang/test/SemaCXX/attr-cleanup.cpp index 32d10683edebb..6048b4e92ec3f 100644 --- a/clang/test/SemaCXX/attr-cleanup.cpp +++ b/clang/test/SemaCXX/attr-cleanup.cpp @@ -27,3 +27,28 @@ namespace E { int v1 __attribute__((cleanup(c3))); // expected-error {{'c3' is not a single function}} } } + +namespace F { + int open() { return 0; } + void close(decltype(open()) *) {} + + void test1() { + auto fd [[gnu::cleanup(close)]] = open(); + } + + template + void test2() { + Ty fd [[gnu::cleanup(close)]] = open(); + } + + template + void test3() { + Ty fd [[gnu::cleanup(close)]] = open(); // #TEST3_CLEANUP + } + + int main() { + test2(); + test3(); // expected-error@#TEST3_CLEANUP {{'cleanup' function 'close' parameter has type 'decltype(open()) *' (aka 'int *') which is incompatible with type 'float *'}} \ + expected-note {{in instantiation of function template specialization 'F::test3' requested here}} + } +} diff --git a/clang/test/SemaHLSL/BuiltIns/ddx-coarse-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/ddx-coarse-errors.hlsl new file mode 100644 index 0000000000000..ebad1cc6826d8 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/ddx-coarse-errors.hlsl @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-library %s -fnative-half-type -verify + +float no_arg() { + return __builtin_hlsl_elementwise_ddx_coarse(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +float too_many_args(float val) { + return __builtin_hlsl_elementwise_ddx_coarse(val, val); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +float test_integer_scalar_input(int val) { + return __builtin_hlsl_elementwise_ddx_coarse(val); + // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'int')}} +} + +double test_double_scalar_input(double val) { + return __builtin_hlsl_elementwise_ddx_coarse(val); + // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double')}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/ddy-coarse-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/ddy-coarse-errors.hlsl new file mode 100644 index 0000000000000..9cc23665882c8 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/ddy-coarse-errors.hlsl @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-library %s -fnative-half-type -verify + +float no_arg() { + return __builtin_hlsl_elementwise_ddy_coarse(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +float 
too_many_args(float val) { + return __builtin_hlsl_elementwise_ddy_coarse(val, val); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +float test_integer_scalar_input(int val) { + return __builtin_hlsl_elementwise_ddy_coarse(val); + // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'int')}} +} + +double test_double_scalar_input(double val) { + return __builtin_hlsl_elementwise_ddy_coarse(val); + // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double')}} +} diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 1ef39d9f2d370..8f4a06526d24d 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -439,8 +439,11 @@ fatbinary(ArrayRef> InputFiles, Args.MakeArgString(Twine("-compression-level=") + Arg->getValue())); SmallVector Targets = {"-targets=host-x86_64-unknown-linux-gnu"}; - for (const auto &[File, Arch] : InputFiles) - Targets.push_back(Saver.save("hip-amdgcn-amd-amdhsa--" + Arch)); + for (const auto &[File, Arch] : InputFiles) { + Targets.push_back(Saver.save(Arch == "amdgcnspirv" + ? "hip-spirv64-amd-amdhsa--" + Arch + : "hip-amdgcn-amd-amdhsa--" + Arch)); + } CmdArgs.push_back(Saver.save(llvm::join(Targets, ","))); #ifdef _WIN32 diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index e49dcb9b70b0f..bee9a01a3b01a 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -5045,6 +5045,26 @@ void EmitClangAttrParsedAttrKinds(const RecordKeeper &Records, << "}\n"; } +// Emits Sema calls for type dependent attributes +void EmitClangAttrIsTypeDependent(const RecordKeeper &Records, + raw_ostream &OS) { + emitSourceFileHeader("Attribute is type dependent", OS, Records); + + OS << "void checkAttrIsTypeDependent(Decl *D, const Attr *A) {\n"; + OS << " switch (A->getKind()) {\n"; + OS << " default:\n"; + OS << " break;\n"; + for (const auto *A : Records.getAllDerivedDefinitions("Attr")) { + if (A->getValueAsBit("IsTypeDependent")) { + OS << " case attr::" << A->getName() << ":\n"; + OS << " ActOn" << A->getName() << "Attr(D, A);\n"; + OS << " break;\n"; + } + } + OS << " }\n"; + OS << "}\n"; +} + // Emits the code to dump an attribute. 
void EmitClangAttrTextNodeDump(const RecordKeeper &Records, raw_ostream &OS) { emitSourceFileHeader("Attribute text node dumper", OS, Records); diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 866040d503646..707ce617cb2d0 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -43,6 +43,7 @@ enum ActionType { GenClangAttrParsedAttrList, GenClangAttrParsedAttrImpl, GenClangAttrParsedAttrKinds, + GenClangAttrIsTypeDependent, GenClangAttrTextNodeDump, GenClangAttrNodeTraverse, GenClangBasicReader, @@ -179,6 +180,9 @@ cl::opt Action( clEnumValN(GenClangAttrParsedAttrKinds, "gen-clang-attr-parsed-attr-kinds", "Generate a clang parsed attribute kinds"), + clEnumValN(GenClangAttrIsTypeDependent, + "gen-clang-attr-is-type-dependent", + "Generate clang is type dependent attribute code"), clEnumValN(GenClangAttrTextNodeDump, "gen-clang-attr-text-node-dump", "Generate clang attribute text node dumper"), clEnumValN(GenClangAttrNodeTraverse, "gen-clang-attr-node-traverse", @@ -423,6 +427,9 @@ bool ClangTableGenMain(raw_ostream &OS, const RecordKeeper &Records) { case GenClangAttrParsedAttrKinds: EmitClangAttrParsedAttrKinds(Records, OS); break; + case GenClangAttrIsTypeDependent: + EmitClangAttrIsTypeDependent(Records, OS); + break; case GenClangAttrTextNodeDump: EmitClangAttrTextNodeDump(Records, OS); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index fa49dcd289bc2..058bda3ebd246 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -82,6 +82,8 @@ void EmitClangAttrParsedAttrImpl(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrParsedAttrKinds(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitClangAttrIsTypeDependent(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitClangAttrTextNodeDump(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrNodeTraverse(const llvm::RecordKeeper &Records, diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 3f7dd8e402b78..ea22fb0babc46 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -875,7 +875,7 @@ def is_windows_lto_supported(): config.substitutions.append( ( "%ld_flags_rpath_so" + postfix, - "-install_name @rpath/`basename %dynamiclib{}`".format(postfix), + "-install_name @rpath/%base_dynamiclib{}".format(postfix), ) ) elif config.target_os in ("FreeBSD", "NetBSD", "OpenBSD"): @@ -908,6 +908,9 @@ def is_windows_lto_supported(): config.substitutions.append( ("%dynamiclib" + postfix, "%t.dir/%xdynamiclib_filename" + postfix) ) + config.substitutions.append( + ("%base_dynamiclib" + postfix, "%xdynamiclib_filename" + postfix) + ) config.substitutions.append( ( "%xdynamiclib_filename" + postfix, diff --git a/compiler-rt/test/rtsan/Darwin/dlopen.cpp b/compiler-rt/test/rtsan/Darwin/dlopen.cpp index 1aabe5cb6e580..435a4353b7026 100644 --- a/compiler-rt/test/rtsan/Darwin/dlopen.cpp +++ b/compiler-rt/test/rtsan/Darwin/dlopen.cpp @@ -8,18 +8,19 @@ // RUN: %clangxx -fsanitize=realtime %s -o %t.so -shared -DSHARED_LIB // RUN: %clangxx %s -o %t -// RUN: RTSAN_DYLIB_PATH=`%clangxx -fsanitize=realtime %s -### 2>&1 \ +// RUN: %clangxx -fsanitize=realtime %s -### 2>&1 \ // RUN: | grep "libclang_rt.rtsan_osx_dynamic.dylib" \ -// RUN: | sed -e 's/.*"\(.*libclang_rt.rtsan_osx_dynamic.dylib\)".*/\1/'` +// RUN: | sed -e 
's/.*"\(.*libclang_rt.rtsan_osx_dynamic.dylib\)".*/\1/' \ +// RUN: | tr -d '\n' > %t.rtsan_dylib_path // Launching a non-instrumented binary that dlopen's an instrumented library should fail. // RUN: not %run %t %t.so 2>&1 | FileCheck %s --check-prefix=CHECK-FAIL // Launching a non-instrumented binary with an explicit DYLD_INSERT_LIBRARIES should work. -// RUN: DYLD_INSERT_LIBRARIES=$RTSAN_DYLIB_PATH %run %t %t.so 2>&1 | FileCheck %s +// RUN: env DYLD_INSERT_LIBRARIES="%{readfile:%t.rtsan_dylib_path}" %run %t %t.so 2>&1 | FileCheck %s // Launching an instrumented binary with the DYLD_INSERT_LIBRARIES env variable has no error // RUN: %clangxx -fsanitize=realtime %s -o %t -// RUN: DYLD_INSERT_LIBRARIES=$RTSAN_DYLIB_PATH %run %t %t.so 2>&1 | FileCheck %s --check-prefix=CHECK-INSTRUMENTED +// RUN: env DYLD_INSERT_LIBRARIES="%{readfile:%t.rtsan_dylib_path}" %run %t %t.so 2>&1 | FileCheck %s --check-prefix=CHECK-INSTRUMENTED #include #include diff --git a/compiler-rt/test/tsan/Darwin/dlopen.cpp b/compiler-rt/test/tsan/Darwin/dlopen.cpp index 3d12b815f9c25..2ab052f1c0c26 100644 --- a/compiler-rt/test/tsan/Darwin/dlopen.cpp +++ b/compiler-rt/test/tsan/Darwin/dlopen.cpp @@ -9,14 +9,15 @@ // RUN: %clangxx_tsan %s -o %t.so -shared -DSHARED_LIB // RUN: %clangxx_tsan -fno-sanitize=thread %s -o %t -// RUN: TSAN_DYLIB_PATH=`%clangxx_tsan %s -### 2>&1 \ +// RUN: %clangxx_tsan %s -### 2>&1 \ // RUN: | grep "libclang_rt.tsan_osx_dynamic.dylib" \ -// RUN: | sed -e 's/.*"\(.*libclang_rt.tsan_osx_dynamic.dylib\)".*/\1/'` +// RUN: | sed -e 's/.*"\(.*libclang_rt.tsan_osx_dynamic.dylib\)".*/\1/' \ +// RUN: | tr -d '\n' > %t.tsan_dylib_path // Launching a non-instrumented binary that dlopen's an instrumented library should fail. // RUN: not %run %t %t.so 2>&1 | FileCheck %s --check-prefix=CHECK-FAIL // Launching a non-instrumented binary with an explicit DYLD_INSERT_LIBRARIES should work. 
-// RUN: DYLD_INSERT_LIBRARIES=$TSAN_DYLIB_PATH %run %t %t.so 2>&1 | FileCheck %s +// RUN: env DYLD_INSERT_LIBRARIES="%{readfile:%t.tsan_dylib_path}" %run %t %t.so 2>&1 | FileCheck %s #include #include diff --git a/compiler-rt/test/tsan/Darwin/external-ignore-noninstrumented.cpp b/compiler-rt/test/tsan/Darwin/external-ignore-noninstrumented.cpp index 916b0b893fc0d..cfa46e0f0a213 100644 --- a/compiler-rt/test/tsan/Darwin/external-ignore-noninstrumented.cpp +++ b/compiler-rt/test/tsan/Darwin/external-ignore-noninstrumented.cpp @@ -1,8 +1,10 @@ +// RUN: basename %t-lib.dylib | tr -d '\n' > %t.basename // RUN: %clangxx_tsan -shared %p/external-lib.cpp -fno-sanitize=thread -DUSE_TSAN_CALLBACKS \ -// RUN: -o %t-lib.dylib -install_name @rpath/`basename %t-lib.dylib` +// RUN: -o %t-lib.dylib -install_name @rpath/%{readfile:%t.basename} +// RUN: basename %t-module.dylib | tr -d '\n' > %t.basename // RUN: %clangxx_tsan -shared %p/external-noninstrumented-module.cpp %t-lib.dylib -fno-sanitize=thread \ -// RUN: -o %t-module.dylib -install_name @rpath/`basename %t-module.dylib` +// RUN: -o %t-module.dylib -install_name @rpath/%{readfile:%t.basename} // RUN: %clangxx_tsan %s %t-module.dylib -o %t // RUN: %run %t 2>&1 | FileCheck %s diff --git a/compiler-rt/test/tsan/Darwin/external.cpp b/compiler-rt/test/tsan/Darwin/external.cpp index bf189eb1d6b5b..52fae36f0e1f4 100644 --- a/compiler-rt/test/tsan/Darwin/external.cpp +++ b/compiler-rt/test/tsan/Darwin/external.cpp @@ -1,14 +1,17 @@ +// RUN: basename %t-lib-instrumented.dylib | tr -d '\n' > %t.basename // RUN: %clangxx_tsan %p/external-lib.cpp -shared \ // RUN: -o %t-lib-instrumented.dylib \ -// RUN: -install_name @rpath/`basename %t-lib-instrumented.dylib` +// RUN: -install_name @rpath/%{readfile:%t.basename} +// RUN: basename %t-lib-noninstrumented.dylib | tr -d '\n' > %t.basename // RUN: %clangxx_tsan %p/external-lib.cpp -shared -fno-sanitize=thread \ // RUN: -o %t-lib-noninstrumented.dylib \ -// RUN: -install_name @rpath/`basename %t-lib-noninstrumented.dylib` +// RUN: -install_name @rpath/%{readfile:%t.basename} +// RUN: basename %t-lib-noninstrumented-callbacks.dylib | tr -d '\n' > %t.basename // RUN: %clangxx_tsan %p/external-lib.cpp -shared -fno-sanitize=thread -DUSE_TSAN_CALLBACKS \ // RUN: -o %t-lib-noninstrumented-callbacks.dylib \ -// RUN: -install_name @rpath/`basename %t-lib-noninstrumented-callbacks.dylib` +// RUN: -install_name @rpath/%{readfile:%t.basename} // RUN: %clangxx_tsan %s %t-lib-instrumented.dylib -o %t-lib-instrumented // RUN: %clangxx_tsan %s %t-lib-noninstrumented.dylib -o %t-lib-noninstrumented diff --git a/compiler-rt/test/tsan/Darwin/malloc-stack-logging.cpp b/compiler-rt/test/tsan/Darwin/malloc-stack-logging.cpp index 8d9c2122d0e6c..0a96e346f8012 100644 --- a/compiler-rt/test/tsan/Darwin/malloc-stack-logging.cpp +++ b/compiler-rt/test/tsan/Darwin/malloc-stack-logging.cpp @@ -4,7 +4,7 @@ // use syscalls directly) to make sure other interceptors aren't called. 
// RUN: %clangxx_tsan -O1 %s -o %t -// RUN: MallocStackLogging=1 %run %t 2>&1 | FileCheck %s +// RUN: env MallocStackLogging=1 %run %t 2>&1 | FileCheck %s #include #include #include diff --git a/libcxx/include/__config b/libcxx/include/__config index 8f461599ffd5b..d79ace0cbb896 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -546,6 +546,12 @@ typedef __char32_t char32_t; # define _LIBCPP_DEPRECATED_(m) # endif +# if defined(__DEPRECATED) && __DEPRECATED && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) +# define _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS 1 +# else +# define _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS 0 +# endif + # if !defined(_LIBCPP_CXX03_LANG) # define _LIBCPP_DEPRECATED_IN_CXX11 _LIBCPP_DEPRECATED # else diff --git a/libcxx/include/ccomplex b/libcxx/include/ccomplex index ee7e088aac54d..c1cb039f83a5e 100644 --- a/libcxx/include/ccomplex +++ b/libcxx/include/ccomplex @@ -26,18 +26,10 @@ # pragma GCC system_header # endif -# if _LIBCPP_STD_VER >= 20 - -using __standard_header_ccomplex - _LIBCPP_DEPRECATED_("removed in C++20. Include instead.") _LIBCPP_NODEBUG = void; -using __use_standard_header_ccomplex _LIBCPP_NODEBUG = __standard_header_ccomplex; - -# elif _LIBCPP_STD_VER >= 17 - -using __standard_header_ccomplex _LIBCPP_DEPRECATED_("Include instead.") _LIBCPP_NODEBUG = void; -using __use_standard_header_ccomplex _LIBCPP_NODEBUG = __standard_header_ccomplex; - +# if _LIBCPP_STD_VER >= 17 && !__building_module(std) && _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS +# warning is deprecated in C++17 and removed in C++20. Include instead. # endif + #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_CCOMPLEX diff --git a/libcxx/include/ciso646 b/libcxx/include/ciso646 index 34164362dc10d..d9eae41291024 100644 --- a/libcxx/include/ciso646 +++ b/libcxx/include/ciso646 @@ -24,13 +24,10 @@ # pragma GCC system_header # endif -# if _LIBCPP_STD_VER >= 20 - -using __standard_header_ciso646 - _LIBCPP_DEPRECATED_("removed in C++20. Include instead.") _LIBCPP_NODEBUG = void; -using __use_standard_header_ciso646 _LIBCPP_NODEBUG = __standard_header_ciso646; - +# if _LIBCPP_STD_VER >= 20 && !__building_module(std) && _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS +# warning is removed in C++20. Include instead. # endif + #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_CISO646 diff --git a/libcxx/include/cstdalign b/libcxx/include/cstdalign index 7f8dd1e1fbaf8..7aa8cc81ad14c 100644 --- a/libcxx/include/cstdalign +++ b/libcxx/include/cstdalign @@ -43,17 +43,10 @@ Macros: # undef __alignof_is_defined # define __alignof_is_defined 1 -# if _LIBCPP_STD_VER >= 20 - -using __standard_header_cstdalign _LIBCPP_DEPRECATED_("removed in C++20.") _LIBCPP_NODEBUG = void; -using __use_standard_header_cstdalign _LIBCPP_NODEBUG = __standard_header_cstdalign; - -# elif _LIBCPP_STD_VER >= 17 - -using __standard_header_cstdalign _LIBCPP_DEPRECATED _LIBCPP_NODEBUG = void; -using __use_standard_header_cstdalign _LIBCPP_NODEBUG = __standard_header_cstdalign; - +# if _LIBCPP_STD_VER >= 17 && !__building_module(std) && _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS +# warning is deprecated in C++17 and removed in C++20. 
# endif + #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_CSTDALIGN diff --git a/libcxx/include/cstdbool b/libcxx/include/cstdbool index a432d5f08b9ae..805a287bd7627 100644 --- a/libcxx/include/cstdbool +++ b/libcxx/include/cstdbool @@ -31,17 +31,10 @@ Macros: # undef __bool_true_false_are_defined # define __bool_true_false_are_defined 1 -# if _LIBCPP_STD_VER >= 20 - -using __standard_header_cstdbool _LIBCPP_DEPRECATED_("removed in C++20.") _LIBCPP_NODEBUG = void; -using __use_standard_header_cstdbool _LIBCPP_NODEBUG = __standard_header_cstdbool; - -# elif _LIBCPP_STD_VER >= 17 - -using __standard_header_cstdbool _LIBCPP_DEPRECATED _LIBCPP_NODEBUG = void; -using __use_standard_header_cstdbool _LIBCPP_NODEBUG = __standard_header_cstdbool; - +# if _LIBCPP_STD_VER >= 17 && !__building_module(std) && _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS +# warning is deprecated in C++17 and removed in C++20. # endif + #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_CSTDBOOL diff --git a/libcxx/include/ctgmath b/libcxx/include/ctgmath index db0786f1e2c46..13b7a96e4d8fc 100644 --- a/libcxx/include/ctgmath +++ b/libcxx/include/ctgmath @@ -28,17 +28,8 @@ # pragma GCC system_header # endif -# if _LIBCPP_STD_VER >= 20 - -using __standard_header_ctgmath - _LIBCPP_DEPRECATED_("removed in C++20. Include and instead.") _LIBCPP_NODEBUG = void; -using __use_standard_header_ctgmath _LIBCPP_NODEBUG = __standard_header_ctgmath; - -# elif _LIBCPP_STD_VER >= 17 - -using __standard_header_ctgmath _LIBCPP_DEPRECATED_("Include and instead.") _LIBCPP_NODEBUG = void; -using __use_standard_header_ctgmath _LIBCPP_NODEBUG = __standard_header_ctgmath; - +# if _LIBCPP_STD_VER >= 17 && !__building_module(std) && _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS +# warning is deprecated in C++17 and removed in C++20. Include and instead. # endif #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py index 6ed35af7e275e..2b643e1f2ad48 100644 --- a/libcxx/test/libcxx/transitive_includes.gen.py +++ b/libcxx/test/libcxx/transitive_includes.gen.py @@ -89,7 +89,7 @@ // UNSUPPORTED: LIBCXX-FREEBSD-FIXME // RUN: mkdir %t -// RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt +// RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} -Wno-deprecated --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt // RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes/to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv // RUN: cat %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv // RUN: diff -w %t/expected_transitive_includes.csv %t/actual_transitive_includes.csv diff --git a/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp index 0eaf82ce5cef0..8df89d0ba9206 100644 --- a/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp +++ b/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp @@ -14,12 +14,6 @@ // UNSUPPORTED: c++03, c++11, c++14 // UNSUPPORTED: clang-modules-build -#include "test_macros.h" - #include -#if TEST_STD_VER >= 20 -// expected-warning@ccomplex:* {{'__standard_header_ccomplex' is deprecated: removed in C++20. 
Include instead.}} -#else -// expected-warning@ccomplex:* {{'__standard_header_ccomplex' is deprecated: Include instead.}} -#endif +// expected-warning@ccomplex:* {{ is deprecated in C++17 and removed in C++20. Include instead.}} diff --git a/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp index 04acd10081548..32b57033331c8 100644 --- a/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp +++ b/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp @@ -15,4 +15,5 @@ // UNSUPPORTED: clang-modules-build #include -// expected-warning@ciso646:* {{'__standard_header_ciso646' is deprecated: removed in C++20. Include instead.}} + +// expected-warning@ciso646:* {{ is removed in C++20. Include instead.}} diff --git a/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp index dc9f1af55b3f1..23a7709a9d658 100644 --- a/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp +++ b/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp @@ -14,12 +14,6 @@ // UNSUPPORTED: c++03, c++11, c++14 // UNSUPPORTED: clang-modules-build -#include "test_macros.h" - #include -#if TEST_STD_VER >= 20 -// expected-warning@cstdalign:* {{'__standard_header_cstdalign' is deprecated: removed in C++20.}} -#else -// expected-warning@cstdalign:* {{'__standard_header_cstdalign' is deprecated}} -#endif +// expected-warning@cstdalign:* {{ is deprecated in C++17 and removed in C++20.}} diff --git a/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp index eddefe14d35ea..c2c0f03c52d3c 100644 --- a/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp +++ b/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp @@ -14,12 +14,6 @@ // UNSUPPORTED: c++03, c++11, c++14 // UNSUPPORTED: clang-modules-build -#include "test_macros.h" - #include -#if TEST_STD_VER >= 20 -// expected-warning@cstdbool:* {{'__standard_header_cstdbool' is deprecated: removed in C++20.}} -#else -// expected-warning@cstdbool:* {{'__standard_header_cstdbool' is deprecated}} -#endif +// expected-warning@cstdbool:* {{ is deprecated in C++17 and removed in C++20.}} diff --git a/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp index 097ab1643d15a..4f5564915443d 100644 --- a/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp +++ b/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp @@ -14,12 +14,6 @@ // UNSUPPORTED: c++03, c++11, c++14 // UNSUPPORTED: clang-modules-build -#include "test_macros.h" - #include -#if TEST_STD_VER >= 20 -// expected-warning@ctgmath:* {{'__standard_header_ctgmath' is deprecated: removed in C++20. Include and instead.}} -#else -// expected-warning@ctgmath:* {{'__standard_header_ctgmath' is deprecated: Include and instead.}} -#endif +// expected-warning@ctgmath:* {{ is deprecated in C++17 and removed in C++20. 
Include and instead.}} diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp index 12d778408d5ec..e58e760a5ce81 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: std-at-least-c++23 +// REQUIRES: std-at-least-c++26 // @@ -21,11 +21,6 @@ void test() { // expected-error@*:* {{static assertion failed}} - // Turns to an error since C++26 (Disallow Binding a Returned Glvalue to a Temporary https://wg21.link/P2748R5). -#if TEST_STD_VER >= 26 // expected-error@tuple:* {{returning reference to local temporary object}} -#else - // expected-warning@tuple:* {{returning reference to local temporary object}} -#endif std::ignore = std::make_from_tuple(std::tuple{}); } diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py index 975209c273f8c..76e9115295b99 100644 --- a/libcxx/utils/libcxx/test/format.py +++ b/libcxx/utils/libcxx/test/format.py @@ -99,7 +99,7 @@ def parseScript(test, preamble): substitutions.append( ( "%{verify}", - "%{cxx} %s %{flags} %{compile_flags} -fsyntax-only -Wno-error -Xclang -verify -Xclang -verify-ignore-unexpected=note -ferror-limit=0", + "%{cxx} %s %{flags} %{compile_flags} -U_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER -fsyntax-only -Wno-error -Xclang -verify -Xclang -verify-ignore-unexpected=note -ferror-limit=0", ) ) substitutions.append(("%{run}", "%{exec} %t.exe")) diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp index 2a97df4785ecb..b0dc797292511 100644 --- a/lld/ELF/Arch/AArch64.cpp +++ b/lld/ELF/Arch/AArch64.cpp @@ -762,7 +762,7 @@ void AArch64::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, relocateNoSym(loc, R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC, val); break; default: - llvm_unreachable("unsupported relocation for TLS GD to LE relaxation"); + llvm_unreachable("unsupported relocation for TLS GD to IE relaxation"); } } diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake index 9a2e73a1e3718..84c03cd6432ed 100644 --- a/llvm/cmake/modules/TableGen.cmake +++ b/llvm/cmake/modules/TableGen.cmake @@ -66,6 +66,16 @@ function(tablegen project ofn) list(APPEND LLVM_TABLEGEN_FLAGS "-omit-comments") endif() + set(EXTRA_OUTPUTS) + if("-gen-register-info" IN_LIST ARGN) + cmake_path(GET ofn STEM OUTPUT_BASENAME) + list(APPEND EXTRA_OUTPUTS + ${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_BASENAME}Enums.inc + ${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_BASENAME}Header.inc + ${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_BASENAME}MCDesc.inc + ${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_BASENAME}TargetDesc.inc) + endif() + # MSVC can't support long string literals ("long" > 65534 bytes)[1], so if there's # a possibility of generated tables being consumed by MSVC, generate arrays of # char literals, instead. 
If we're cross-compiling, then conservatively assume @@ -126,7 +136,7 @@ function(tablegen project ofn) set(LLVM_TABLEGEN_JOB_POOL "") endif() - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} ${EXTRA_OUTPUTS} COMMAND ${tablegen_exe} ${ARG_UNPARSED_ARGUMENTS} ${tblgen_includes} ${LLVM_TABLEGEN_FLAGS} diff --git a/llvm/docs/TableGen/BackEnds.rst b/llvm/docs/TableGen/BackEnds.rst index 7f571378860b2..1e3cb8783df16 100644 --- a/llvm/docs/TableGen/BackEnds.rst +++ b/llvm/docs/TableGen/BackEnds.rst @@ -355,6 +355,13 @@ ClangAttrParsedAttrKinds ``AttributeList::getKind`` function, mapping a string (and syntax) to a parsed attribute ``AttributeList::Kind`` enumeration. +ClangAttrIsTypeDependent +------------------------ + +**Purpose**: Creates ``AttrIsTypeDependent.inc``, which is used to implement the +``Sema::CheckAttributesOnDeducedType`` function, mapping an attribute kind to a +Sema function if it exists. + ClangAttrDump ------------- diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index cec7d09f494d6..4c932c523e423 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3492,6 +3492,13 @@ class LLVM_ABI TargetLoweringBase { return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT)); } + // Return true if the target wants to optimize the mul overflow intrinsic + // for the given \p VT. + virtual bool shouldOptimizeMulOverflowWithZeroHighBits(LLVMContext &Context, + EVT VT) const { + return false; + } + // Return true if it is profitable to use a scalar input to a BUILD_VECTOR // even if the vector itself has multiple uses. virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const { diff --git a/llvm/include/llvm/IR/Constant.h b/llvm/include/llvm/IR/Constant.h index 0be1fc172ebd4..e8ce453559ed7 100644 --- a/llvm/include/llvm/IR/Constant.h +++ b/llvm/include/llvm/IR/Constant.h @@ -79,6 +79,9 @@ class Constant : public User { /// Return true if the value is the smallest signed value. LLVM_ABI bool isMinSignedValue() const; + /// Return true if the value is the largest signed value. + LLVM_ABI bool isMaxSignedValue() const; + /// Return true if this is a finite and non-zero floating-point scalar /// constant or a fixed width vector constant with all finite and non-zero /// elements. 
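A minimal sketch (not part of the patch) of the intended semantics of the new Constant::isMaxSignedValue() helper declared above; it matches the implementation added later in this diff in llvm/lib/IR/Constants.cpp and assumes only an existing LLVMContext:

// Illustrative only; mirrors the new helper's behaviour for scalars and splats.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static bool demoIsMaxSignedValue(LLVMContext &Ctx) {
  Type *I8 = Type::getInt8Ty(Ctx);
  Constant *MaxI8 = ConstantInt::get(I8, 127);   // INT8_MAX
  Constant *FiveI8 = ConstantInt::get(I8, 5);
  // Splats of INT_MAX are recognised through getSplatValue().
  Constant *Splat = ConstantVector::getSplat(ElementCount::getFixed(4), MaxI8);
  return MaxI8->isMaxSignedValue() &&    // true
         !FiveI8->isMaxSignedValue() &&  // false
         Splat->isMaxSignedValue();      // true
}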
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index d7db935ee07f1..5a4cc776b26a5 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -170,6 +170,8 @@ def int_dx_splitdouble : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMMatchType<0> [LLVMScalarOrSameVectorWidth<0, llvm_double_ty>], [IntrNoMem]>; def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_dx_discard : DefaultAttrsIntrinsic<[], [llvm_i1_ty], []>; +def int_dx_ddx_coarse : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; +def int_dx_ddy_coarse : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_dx_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; def int_dx_firstbitshigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; def int_dx_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index f39c6cda2c579..2f7c25550a0cc 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -134,6 +134,8 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty] def int_spv_group_memory_barrier_with_group_sync : DefaultAttrsIntrinsic<[], [], [IntrConvergent]>; def int_spv_discard : DefaultAttrsIntrinsic<[], [], []>; + def int_spv_ddx_coarse : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_spv_ddy_coarse : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_spv_uclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_spv_sclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_spv_nclamp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; diff --git a/llvm/include/llvm/TableGen/Main.h b/llvm/include/llvm/TableGen/Main.h index bafce3a463acc..daede9f5a46f0 100644 --- a/llvm/include/llvm/TableGen/Main.h +++ b/llvm/include/llvm/TableGen/Main.h @@ -14,7 +14,6 @@ #define LLVM_TABLEGEN_MAIN_H #include "llvm/Support/CommandLine.h" -#include #include namespace llvm { @@ -30,18 +29,17 @@ struct TableGenOutputFiles { }; /// Returns true on error, false otherwise. -using TableGenMainFn = bool(raw_ostream &OS, const RecordKeeper &Records); +using TableGenMainFn = + function_ref; /// Perform the action using Records, and store output in OutFiles. /// Returns true on error, false otherwise. -using MultiFileTableGenMainFn = bool(TableGenOutputFiles &OutFiles, - const RecordKeeper &Records); +using MultiFileTableGenMainFn = function_ref; -int TableGenMain(const char *argv0, - std::function MainFn = nullptr); +int TableGenMain(const char *argv0, TableGenMainFn MainFn = nullptr); -int TableGenMain(const char *argv0, - std::function MainFn = nullptr); +int TableGenMain(const char *argv0, MultiFileTableGenMainFn MainFn = nullptr); /// Controls emitting large character arrays as strings or character arrays. /// Typically set to false when building with MSVC. 
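A minimal sketch (not from this patch) of how a typical *-tblgen driver calls the updated TableGenMain entry point; llvm::function_ref binds to free functions and lambdas alike, so existing drivers keep compiling unchanged. The backend EmitMyTables is invented for illustration:

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Main.h"
#include "llvm/TableGen/Record.h"
using namespace llvm;

// Hypothetical single-output backend; returns true on error, as before.
static bool EmitMyTables(raw_ostream &OS, const RecordKeeper &Records) {
  OS << "// Records parsed from " << Records.getInputFilename() << "\n";
  return false;
}

int main(int argc, char **argv) {
  InitLLVM X(argc, argv);
  cl::ParseCommandLineOptions(argc, argv);
  // Resolves to the single-file overload taking TableGenMainFn.
  return TableGenMain(argv[0], EmitMyTables);
}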
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 6f44713bd22cd..8968f6b934d77 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Statepoint.h" @@ -6676,6 +6677,62 @@ static MinMaxOptResult OptimizeConstMinMax(const Constant *RHSConst, return MinMaxOptResult::CannotOptimize; } +static Value *simplifySVEIntReduction(Intrinsic::ID IID, Type *ReturnType, + Value *Op0, Value *Op1) { + Constant *C0 = dyn_cast(Op0); + Constant *C1 = dyn_cast(Op1); + unsigned Width = ReturnType->getPrimitiveSizeInBits(); + + // All false predicate or reduction of neutral values ==> neutral result. + switch (IID) { + case Intrinsic::aarch64_sve_eorv: + case Intrinsic::aarch64_sve_orv: + case Intrinsic::aarch64_sve_saddv: + case Intrinsic::aarch64_sve_uaddv: + case Intrinsic::aarch64_sve_umaxv: + if ((C0 && C0->isNullValue()) || (C1 && C1->isNullValue())) + return ConstantInt::get(ReturnType, 0); + break; + case Intrinsic::aarch64_sve_andv: + case Intrinsic::aarch64_sve_uminv: + if ((C0 && C0->isNullValue()) || (C1 && C1->isAllOnesValue())) + return ConstantInt::get(ReturnType, APInt::getMaxValue(Width)); + break; + case Intrinsic::aarch64_sve_smaxv: + if ((C0 && C0->isNullValue()) || (C1 && C1->isMinSignedValue())) + return ConstantInt::get(ReturnType, APInt::getSignedMinValue(Width)); + break; + case Intrinsic::aarch64_sve_sminv: + if ((C0 && C0->isNullValue()) || (C1 && C1->isMaxSignedValue())) + return ConstantInt::get(ReturnType, APInt::getSignedMaxValue(Width)); + break; + } + + switch (IID) { + case Intrinsic::aarch64_sve_andv: + case Intrinsic::aarch64_sve_orv: + case Intrinsic::aarch64_sve_smaxv: + case Intrinsic::aarch64_sve_sminv: + case Intrinsic::aarch64_sve_umaxv: + case Intrinsic::aarch64_sve_uminv: + // sve_reduce_##(all, splat(X)) ==> X + if (C0 && C0->isAllOnesValue()) { + if (Value *SplatVal = getSplatValue(Op1)) { + assert(SplatVal->getType() == ReturnType && "Unexpected result type!"); + return SplatVal; + } + } + break; + case Intrinsic::aarch64_sve_eorv: + // sve_reduce_xor(all, splat(X)) ==> 0 + if (C0 && C0->isAllOnesValue()) + return ConstantInt::get(ReturnType, 0); + break; + } + + return nullptr; +} + Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, Value *Op0, Value *Op1, const SimplifyQuery &Q, @@ -7037,6 +7094,17 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, break; } + + case Intrinsic::aarch64_sve_andv: + case Intrinsic::aarch64_sve_eorv: + case Intrinsic::aarch64_sve_orv: + case Intrinsic::aarch64_sve_saddv: + case Intrinsic::aarch64_sve_smaxv: + case Intrinsic::aarch64_sve_sminv: + case Intrinsic::aarch64_sve_uaddv: + case Intrinsic::aarch64_sve_umaxv: + case Intrinsic::aarch64_sve_uminv: + return simplifySVEIntReduction(IID, ReturnType, Op0, Op1); default: break; } diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index d35c17e3afd2b..877ae146ce79f 100644 --- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -616,9 +616,8 @@ void ValueEnumerator::OptimizeConstants(unsigned CstStart, unsigned CstEnd) { /// EnumerateValueSymbolTable - Insert all of the values in the specified symbol /// 
table into the values table. void ValueEnumerator::EnumerateValueSymbolTable(const ValueSymbolTable &VST) { - for (ValueSymbolTable::const_iterator VI = VST.begin(), VE = VST.end(); - VI != VE; ++VI) - EnumerateValue(VI->getValue()); + for (const auto &VI : VST) + EnumerateValue(VI.getValue()); } /// Insert all of the values referenced by named metadata in the specified diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 7068eb440e31b..fbb35cedf5274 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -432,6 +432,8 @@ class CodeGenPrepare { bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy, unsigned AddrSpace); bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr); + bool optimizeMulWithOverflow(Instruction *I, bool IsSigned, + ModifyDT &ModifiedDT); bool optimizeInlineAsmInst(CallInst *CS); bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT); bool optimizeExt(Instruction *&I); @@ -2798,6 +2800,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { } } return false; + case Intrinsic::umul_with_overflow: + return optimizeMulWithOverflow(II, /*IsSigned=*/false, ModifiedDT); + case Intrinsic::smul_with_overflow: + return optimizeMulWithOverflow(II, /*IsSigned=*/true, ModifiedDT); } SmallVector PtrOps; @@ -6437,6 +6443,182 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst, return true; } +// This is a helper for CodeGenPrepare::optimizeMulWithOverflow. +// Check the pattern we are interested in where there are maximum 2 uses +// of the intrinsic which are the extract instructions. +static bool matchOverflowPattern(Instruction *&I, ExtractValueInst *&MulExtract, + ExtractValueInst *&OverflowExtract) { + // Bail out if it's more than 2 users: + if (I->hasNUsesOrMore(3)) + return false; + + for (User *U : I->users()) { + auto *Extract = dyn_cast(U); + if (!Extract || Extract->getNumIndices() != 1) + return false; + + unsigned Index = Extract->getIndices()[0]; + if (Index == 0) + MulExtract = Extract; + else if (Index == 1) + OverflowExtract = Extract; + else + return false; + } + return true; +} + +// Rewrite the mul_with_overflow intrinsic by checking if both of the +// operands' value ranges are within the legal type. If so, we can optimize the +// multiplication algorithm. This code is supposed to be written during the step +// of type legalization, but given that we need to reconstruct the IR which is +// not doable there, we do it here. +// The IR after the optimization will look like: +// entry: +// if signed: +// ( (lhs_lo>>BW-1) ^ lhs_hi) || ( (rhs_lo>>BW-1) ^ rhs_hi) ? overflow, +// overflow_no +// else: +// (lhs_hi != 0) || (rhs_hi != 0) ? overflow, overflow_no +// overflow_no: +// overflow: +// overflow.res: +// \returns true if optimization was applied +// TODO: This optimization can be further improved to optimize branching on +// overflow where the 'overflow_no' BB can branch directly to the false +// successor of overflow, but that would add additional complexity so we leave +// it for future work. +bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, + ModifyDT &ModifiedDT) { + // Check if target supports this optimization. 
+ if (!TLI->shouldOptimizeMulOverflowWithZeroHighBits( + I->getContext(), + TLI->getValueType(*DL, I->getType()->getContainedType(0)))) + return false; + + ExtractValueInst *MulExtract = nullptr, *OverflowExtract = nullptr; + if (!matchOverflowPattern(I, MulExtract, OverflowExtract)) + return false; + + // Keep track of the instruction to stop reoptimizing it again. + InsertedInsts.insert(I); + + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + Type *Ty = LHS->getType(); + unsigned VTHalfBitWidth = Ty->getScalarSizeInBits() / 2; + Type *LegalTy = Ty->getWithNewBitWidth(VTHalfBitWidth); + + // New BBs: + BasicBlock *OverflowEntryBB = + I->getParent()->splitBasicBlock(I, "", /*Before*/ true); + OverflowEntryBB->takeName(I->getParent()); + // Keep the 'br' instruction that is generated as a result of the split to be + // erased/replaced later. + Instruction *OldTerminator = OverflowEntryBB->getTerminator(); + BasicBlock *NoOverflowBB = + BasicBlock::Create(I->getContext(), "overflow.no", I->getFunction()); + NoOverflowBB->moveAfter(OverflowEntryBB); + BasicBlock *OverflowBB = + BasicBlock::Create(I->getContext(), "overflow", I->getFunction()); + OverflowBB->moveAfter(NoOverflowBB); + + // BB overflow.entry: + IRBuilder<> Builder(OverflowEntryBB); + // Extract low and high halves of LHS: + Value *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs"); + Value *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); + HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); + + // Extract low and high halves of RHS: + Value *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs"); + Value *HiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); + HiRHS = Builder.CreateTrunc(HiRHS, LegalTy, "hi.rhs"); + + Value *IsAnyBitTrue; + if (IsSigned) { + Value *SignLoLHS = + Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); + Value *SignLoRHS = + Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs"); + Value *XorLHS = Builder.CreateXor(HiLHS, SignLoLHS); + Value *XorRHS = Builder.CreateXor(HiRHS, SignLoRHS); + Value *Or = Builder.CreateOr(XorLHS, XorRHS, "or.lhs.rhs"); + IsAnyBitTrue = Builder.CreateCmp(ICmpInst::ICMP_NE, Or, + ConstantInt::getNullValue(Or->getType())); + } else { + Value *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, + ConstantInt::getNullValue(LegalTy)); + Value *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, + ConstantInt::getNullValue(LegalTy)); + IsAnyBitTrue = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs"); + } + Builder.CreateCondBr(IsAnyBitTrue, OverflowBB, NoOverflowBB); + + // BB overflow.no: + Builder.SetInsertPoint(NoOverflowBB); + Value *ExtLoLHS, *ExtLoRHS; + if (IsSigned) { + ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); + ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); + } else { + ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext"); + ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); + } + + Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.overflow.no"); + + // Create the 'overflow.res' BB to merge the results of + // the two paths: + BasicBlock *OverflowResBB = I->getParent(); + OverflowResBB->setName("overflow.res"); + + // BB overflow.no: jump to overflow.res BB + Builder.CreateBr(OverflowResBB); + // No we don't need the old terminator in overflow.entry BB, erase it: + OldTerminator->eraseFromParent(); + + // BB overflow.res: + Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt()); + // Create PHI nodes to merge results from no.overflow BB and overflow BB to + // 
replace the extract instructions. + PHINode *OverflowResPHI = Builder.CreatePHI(Ty, 2), + *OverflowFlagPHI = + Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2); + + // Add the incoming values from no.overflow BB and later from overflow BB. + OverflowResPHI->addIncoming(Mul, NoOverflowBB); + OverflowFlagPHI->addIncoming(ConstantInt::getFalse(I->getContext()), + NoOverflowBB); + + // Replace all users of MulExtract and OverflowExtract to use the PHI nodes. + if (MulExtract) { + MulExtract->replaceAllUsesWith(OverflowResPHI); + MulExtract->eraseFromParent(); + } + if (OverflowExtract) { + OverflowExtract->replaceAllUsesWith(OverflowFlagPHI); + OverflowExtract->eraseFromParent(); + } + + // Remove the intrinsic from parent (overflow.res BB) as it will be part of + // overflow BB + I->removeFromParent(); + // BB overflow: + I->insertInto(OverflowBB, OverflowBB->end()); + Builder.SetInsertPoint(OverflowBB, OverflowBB->end()); + Value *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow"); + Value *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag"); + Builder.CreateBr(OverflowResBB); + + // Add The Extracted values to the PHINodes in the overflow.res BB. + OverflowResPHI->addIncoming(MulOverflow, OverflowBB); + OverflowFlagPHI->addIncoming(OverflowFlag, OverflowBB); + + ModifiedDT = ModifyDT::ModifyBBDT; + return true; +} + /// If there are any memory operands, use OptimizeMemoryInst to sink their /// address computing into the block when possible / profitable. bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { diff --git a/llvm/lib/DWARFCFIChecker/DWARFCFIState.cpp b/llvm/lib/DWARFCFIChecker/DWARFCFIState.cpp index 4d2d2da8a4445..2918ba1de652f 100644 --- a/llvm/lib/DWARFCFIChecker/DWARFCFIState.cpp +++ b/llvm/lib/DWARFCFIChecker/DWARFCFIState.cpp @@ -64,7 +64,6 @@ dwarf::CFIProgram DWARFCFIState::convert(MCCFIInstruction Directive) { /* CodeAlignmentFactor */ 1, /* DataAlignmentFactor */ 1, Context->getTargetTriple().getArch()); - auto MaybeCurrentRow = getCurrentUnwindRow(); switch (Directive.getOperation()) { case MCCFIInstruction::OpSameValue: CFIP.addInstruction(dwarf::DW_CFA_same_value, Directive.getRegister()); diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt index ab287c7af60be..6be59b0890c44 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_component_library(LLVMOrcDebugging BinaryFormat DebugInfoDWARF JITLink + Object OrcJIT OrcShared Support diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp index 9f556b0d07a8b..653645ff03f15 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp @@ -1,4 +1,4 @@ -//===------- ELFDebugObjectPlugin.cpp - JITLink debug objects ---------===// +//===--------- ELFDebugObjectPlugin.cpp - JITLink debug objects -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
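Returning to the CodeGenPrepare change above: matchOverflowPattern() only accepts an {s,u}mul.with.overflow call whose sole users are the two extractvalue instructions. A hedged sketch of how such IR is commonly built with IRBuilder (the helper emitCheckedMul is invented for illustration):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Emits: %pair = call {iN, i1} @llvm.umul.with.overflow.iN(iN %lhs, iN %rhs)
//        %mul  = extractvalue {iN, i1} %pair, 0
//        %ovf  = extractvalue {iN, i1} %pair, 1
static Value *emitCheckedMul(IRBuilder<> &B, Value *LHS, Value *RHS,
                             Value *&OverflowFlag) {
  Value *Pair =
      B.CreateBinaryIntrinsic(Intrinsic::umul_with_overflow, LHS, RHS);
  Value *Product = B.CreateExtractValue(Pair, 0, "mul");
  OverflowFlag = B.CreateExtractValue(Pair, 1, "overflow");
  return Product;
}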
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index cbce8bd736102..a3aa5e9571657 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -183,6 +183,23 @@ bool Constant::isMinSignedValue() const { return false; } +bool Constant::isMaxSignedValue() const { + // Check for INT_MAX integers + if (const ConstantInt *CI = dyn_cast(this)) + return CI->isMaxValue(/*isSigned=*/true); + + // Check for FP which are bitcasted from INT_MAX integers + if (const ConstantFP *CFP = dyn_cast(this)) + return CFP->getValueAPF().bitcastToAPInt().isMaxSignedValue(); + + // Check for splats of INT_MAX values. + if (getType()->isVectorTy()) + if (const auto *SplatVal = getSplatValue()) + return SplatVal->isMaxSignedValue(); + + return false; +} + bool Constant::isNotMinSignedValue() const { // Check for INT_MIN integers if (const ConstantInt *CI = dyn_cast(this)) diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp index 3330b70cdc2e1..939e9c6bf5d2f 100644 --- a/llvm/lib/TableGen/Main.cpp +++ b/llvm/lib/TableGen/Main.cpp @@ -127,8 +127,7 @@ static int WriteOutput(const TGParser &Parser, const char *argv0, return 0; } -int llvm::TableGenMain(const char *argv0, - std::function MainFn) { +int llvm::TableGenMain(const char *argv0, MultiFileTableGenMainFn MainFn) { RecordKeeper Records; TGTimer &Timer = Records.getTimer(); @@ -209,8 +208,7 @@ int llvm::TableGenMain(const char *argv0, return 0; } -int llvm::TableGenMain(const char *argv0, - std::function MainFn) { +int llvm::TableGenMain(const char *argv0, TableGenMainFn MainFn) { return TableGenMain(argv0, [&MainFn](TableGenOutputFiles &OutFiles, const RecordKeeper &Records) { std::string S; diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index f1db05dda4e40..08466667c0fa5 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -4403,43 +4403,46 @@ bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm) { bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm, bool Invert) { - if (auto CNode = dyn_cast(N)) { - uint64_t ImmVal = CNode->getZExtValue(); - SDLoc DL(N); - - if (Invert) - ImmVal = ~ImmVal; + uint64_t ImmVal; + if (auto CI = dyn_cast(N)) + ImmVal = CI->getZExtValue(); + else if (auto CFP = dyn_cast(N)) + ImmVal = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); + else + return false; - // Shift mask depending on type size. - switch (VT.SimpleTy) { - case MVT::i8: - ImmVal &= 0xFF; - ImmVal |= ImmVal << 8; - ImmVal |= ImmVal << 16; - ImmVal |= ImmVal << 32; - break; - case MVT::i16: - ImmVal &= 0xFFFF; - ImmVal |= ImmVal << 16; - ImmVal |= ImmVal << 32; - break; - case MVT::i32: - ImmVal &= 0xFFFFFFFF; - ImmVal |= ImmVal << 32; - break; - case MVT::i64: - break; - default: - llvm_unreachable("Unexpected type"); - } + if (Invert) + ImmVal = ~ImmVal; - uint64_t encoding; - if (AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) { - Imm = CurDAG->getTargetConstant(encoding, DL, MVT::i64); - return true; - } + // Shift mask depending on type size. 
+ switch (VT.SimpleTy) { + case MVT::i8: + ImmVal &= 0xFF; + ImmVal |= ImmVal << 8; + ImmVal |= ImmVal << 16; + ImmVal |= ImmVal << 32; + break; + case MVT::i16: + ImmVal &= 0xFFFF; + ImmVal |= ImmVal << 16; + ImmVal |= ImmVal << 32; + break; + case MVT::i32: + ImmVal &= 0xFFFFFFFF; + ImmVal |= ImmVal << 32; + break; + case MVT::i64: + break; + default: + llvm_unreachable("Unexpected type"); } - return false; + + uint64_t encoding; + if (!AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) + return false; + + Imm = CurDAG->getTargetConstant(encoding, SDLoc(N), MVT::i64); + return true; } // SVE shift intrinsics allow shift amounts larger than the element's bitwidth. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 42567883b2594..d21e19b2ecd46 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18851,6 +18851,15 @@ bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, return (Index == 0 || Index == ResVT.getVectorMinNumElements()); } +bool AArch64TargetLowering::shouldOptimizeMulOverflowWithZeroHighBits( + LLVMContext &Context, EVT VT) const { + if (getTypeAction(Context, VT) != TypeExpandInteger) + return false; + + EVT LegalTy = EVT::getIntegerVT(Context, VT.getSizeInBits() / 2); + return getTypeAction(Context, LegalTy) == TargetLowering::TypeLegal; +} + /// Turn vector tests of the signbit in the form of: /// xor (sra X, elt_size(X)-1), -1 /// into: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 70bfae717fb76..be198e54cbcbf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -333,6 +333,11 @@ class AArch64TargetLowering : public TargetLowering { return TargetLowering::shouldFormOverflowOp(Opcode, VT, true); } + // Return true if the target wants to optimize the mul overflow intrinsic + // for the given \p VT. 
+ bool shouldOptimizeMulOverflowWithZeroHighBits(LLVMContext &Context, + EVT VT) const override; + Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index c8c21c4822ffe..e99b3f8ff07e0 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -989,7 +989,7 @@ let Predicates = [HasSVE_or_SME] in { (DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>; // Duplicate FP immediate into all vector elements - let AddedComplexity = 2 in { + let AddedComplexity = 3 in { def : Pat<(nxv8f16 (splat_vector fpimm16:$imm8)), (FDUP_ZI_H fpimm16:$imm8)>; def : Pat<(nxv4f16 (splat_vector fpimm16:$imm8)), diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 1664f4ad0c8fa..1e771e1fb9403 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -347,6 +347,11 @@ def SVELogicalImm16Pat : ComplexPattern", def SVELogicalImm32Pat : ComplexPattern", []>; def SVELogicalImm64Pat : ComplexPattern", []>; +def SVELogicalFPImm16Pat : ComplexPattern", []>; +def SVELogicalFPImm32Pat : ComplexPattern", []>; +def SVELogicalFPImm64Pat : ComplexPattern", []>; +def SVELogicalBFPImmPat : ComplexPattern", []>; + def SVELogicalImm8NotPat : ComplexPattern", []>; def SVELogicalImm16NotPat : ComplexPattern", []>; def SVELogicalImm32NotPat : ComplexPattern", []>; @@ -2160,6 +2165,26 @@ multiclass sve_int_dup_mask_imm { (!cast(NAME) i64:$imm)>; def : Pat<(nxv2i64 (splat_vector (i64 (SVELogicalImm64Pat i64:$imm)))), (!cast(NAME) i64:$imm)>; + + def : Pat<(nxv8f16 (splat_vector (f16 (SVELogicalFPImm16Pat i64:$imm)))), + (!cast(NAME) i64:$imm)>; + def : Pat<(nxv4f16 (splat_vector (f16 (SVELogicalFPImm16Pat i64:$imm)))), + (!cast(NAME) i64:$imm)>; + def : Pat<(nxv2f16 (splat_vector (f16 (SVELogicalFPImm16Pat i64:$imm)))), + (!cast(NAME) i64:$imm)>; + def : Pat<(nxv4f32 (splat_vector (f32 (SVELogicalFPImm32Pat i64:$imm)))), + (!cast(NAME) i64:$imm)>; + def : Pat<(nxv2f32 (splat_vector (f32 (SVELogicalFPImm32Pat i64:$imm)))), + (!cast(NAME) i64:$imm)>; + def : Pat<(nxv2f64 (splat_vector (f64 (SVELogicalFPImm64Pat i64:$imm)))), + (!cast(NAME) i64:$imm)>; + + def : Pat<(nxv8bf16 (splat_vector (bf16 (SVELogicalBFPImmPat i64:$imm)))), + (!cast(NAME) i64:$imm)>; + def : Pat<(nxv4bf16 (splat_vector (bf16 (SVELogicalBFPImmPat i64:$imm)))), + (!cast(NAME) i64:$imm)>; + def : Pat<(nxv2bf16 (splat_vector (bf16 (SVELogicalBFPImmPat i64:$imm)))), + (!cast(NAME) i64:$imm)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index 9af812960542c..b7078825928be 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -314,9 +314,7 @@ class SplitGraph { #endif bool empty() const { return Nodes.empty(); } - const iterator_range nodes() const { - return {Nodes.begin(), Nodes.end()}; - } + iterator_range nodes() const { return Nodes; } const Node &getNode(unsigned ID) const { return *Nodes[ID]; } unsigned getNumNodes() const { return Nodes.size(); } diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 67437f6969b27..8b2866260e9c9 100644 --- 
a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -930,6 +930,24 @@ def Discard : DXILOp<82, discard> { let stages = [Stages]; } +def DerivCoarseX : DXILOp<83, unary> { + let Doc = "computes the rate of change per stamp in x direction"; + let intrinsics = [IntrinSelect]; + let arguments = [OverloadTy]; + let result = OverloadTy; + let overloads = [Overloads]; + let stages = [Stages]; +} + +def DerivCoarseY : DXILOp<84, unary> { + let Doc = "computes the rate of change per stamp in y direction"; + let intrinsics = [IntrinSelect]; + let arguments = [OverloadTy]; + let result = OverloadTy; + let overloads = [Overloads]; + let stages = [Stages]; +} + def ThreadId : DXILOp<93, threadId> { let Doc = "Reads the thread ID"; let intrinsics = [IntrinSelect]; diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 6cacbf6564db2..a755dd522969d 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -64,6 +64,8 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_imad: case Intrinsic::dx_umad: + case Intrinsic::dx_ddx_coarse: + case Intrinsic::dx_ddy_coarse: return true; default: return false; diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 47022b3f89a8b..76fd834fd7219 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -1697,11 +1697,16 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(unsigned BitWidth, MachineIRBuilder MIRBuilder(DepMBB, DepMBB.getFirstNonPHI()); const MachineInstr *NewMI = createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { - return BuildMI(MIRBuilder.getMBB(), *MIRBuilder.getInsertPt(), - MIRBuilder.getDL(), TII.get(SPIRVOPcode)) - .addDef(createTypeVReg(CurMF->getRegInfo())) - .addImm(BitWidth) - .addImm(0); + auto NewTypeMI = BuildMI(MIRBuilder.getMBB(), *MIRBuilder.getInsertPt(), + MIRBuilder.getDL(), TII.get(SPIRVOPcode)) + .addDef(createTypeVReg(CurMF->getRegInfo())) + .addImm(BitWidth); + // Don't add Encoding to FP type + if (!Ty->isFloatTy()) { + return NewTypeMI.addImm(0); + } else { + return NewTypeMI; + } }); add(Ty, false, NewMI); return finishCreatingSPIRVType(Ty, NewMI); diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index fc87288a4a212..0653b4eb9dfe2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -328,6 +328,8 @@ class SPIRVInstructionSelector : public InstructionSelector { MachineInstr &I) const; bool selectFrexp(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectDpdCoarse(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, const unsigned DPdOpCode) const; // Utilities std::pair buildI32Constant(uint32_t Val, MachineInstr &I, @@ -371,6 +373,7 @@ class SPIRVInstructionSelector : public InstructionSelector { bool loadHandleBeforePosition(Register &HandleReg, const SPIRVType *ResType, GIntrinsic &HandleDef, MachineInstr &Pos) const; void decorateUsesAsNonUniform(Register &NonUniformReg) const; + void errorIfInstrOutsideShader(MachineInstr &I) const; }; bool sampledTypeIsSignedInteger(const llvm::Type *HandleType) { @@ -3140,6 +3143,58 @@ bool SPIRVInstructionSelector::wrapIntoSpecConstantOp( return Result; 
} +bool SPIRVInstructionSelector::selectDpdCoarse(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I, + const unsigned DPdOpCode) const { + // TODO: This should check specifically for Fragment Execution Model, but STI + // doesn't provide that information yet. See #167562 + errorIfInstrOutsideShader(I); + + // If the arg/result types are half then we need to wrap the instr in + // conversions to float + // This case occurs because a half arg/result is legal in HLSL but not spirv. + Register SrcReg = I.getOperand(2).getReg(); + SPIRVType *SrcType = GR.getSPIRVTypeForVReg(SrcReg); + unsigned BitWidth = std::min(GR.getScalarOrVectorBitWidth(SrcType), + GR.getScalarOrVectorBitWidth(ResType)); + if (BitWidth == 32) + return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(DPdOpCode)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(2).getReg()); + + MachineIRBuilder MIRBuilder(I); + unsigned componentCount = GR.getScalarOrVectorComponentCount(SrcType); + SPIRVType *F32ConvertTy = GR.getOrCreateSPIRVFloatType(32, I, TII); + if (componentCount != 1) + F32ConvertTy = GR.getOrCreateSPIRVVectorType(F32ConvertTy, componentCount, + MIRBuilder, false); + + const TargetRegisterClass *RegClass = GR.getRegClass(SrcType); + Register ConvertToVReg = MRI->createVirtualRegister(RegClass); + Register DpdOpVReg = MRI->createVirtualRegister(RegClass); + + bool Result = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpFConvert)) + .addDef(ConvertToVReg) + .addUse(GR.getSPIRVTypeID(F32ConvertTy)) + .addUse(SrcReg) + .constrainAllUses(TII, TRI, RBI); + Result &= BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(DPdOpCode)) + .addDef(DpdOpVReg) + .addUse(GR.getSPIRVTypeID(F32ConvertTy)) + .addUse(ConvertToVReg) + .constrainAllUses(TII, TRI, RBI); + Result &= + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpFConvert)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(DpdOpVReg) + .constrainAllUses(TII, TRI, RBI); + return Result; +} + bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -3528,7 +3583,12 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_unpackhalf2x16: { return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16); } - + case Intrinsic::spv_ddx_coarse: { + return selectDpdCoarse(ResVReg, ResType, I, SPIRV::OpDPdxCoarse); + } + case Intrinsic::spv_ddy_coarse: { + return selectDpdCoarse(ResVReg, ResType, I, SPIRV::OpDPdyCoarse); + } default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); @@ -4694,6 +4754,17 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition( .constrainAllUses(TII, TRI, RBI); } +void SPIRVInstructionSelector::errorIfInstrOutsideShader( + MachineInstr &I) const { + if (!STI.isShader()) { + std::string DiagMsg; + raw_string_ostream OS(DiagMsg); + I.print(OS, true, false, false, false); + DiagMsg += " is only supported in shaders.\n"; + report_fatal_error(DiagMsg.c_str(), false); + } +} + namespace llvm { InstructionSelector * createSPIRVInstructionSelector(const SPIRVTargetMachine &TM, diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index b8cd9c1358f00..bd754d17694b8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -934,7 +934,8 @@ void RequirementHandler::initAvailableCapabilitiesForVulkan( Capability::UniformBufferArrayDynamicIndexing, 
Capability::SampledImageArrayDynamicIndexing, Capability::StorageBufferArrayDynamicIndexing, - Capability::StorageImageArrayDynamicIndexing}); + Capability::StorageImageArrayDynamicIndexing, + Capability::DerivativeControl}); // Became core in Vulkan 1.2 if (ST.isAtLeastSPIRVVer(VersionTuple(1, 5))) { @@ -2148,6 +2149,12 @@ void addInstrRequirements(const MachineInstr &MI, } break; } + case SPIRV::OpDPdxCoarse: + case SPIRV::OpDPdyCoarse: { + Reqs.addCapability(SPIRV::Capability::DerivativeControl); + break; + } + default: break; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 621f1868d3311..864e5dc67682c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -54688,11 +54688,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); // Check the shift amount is byte aligned. // Check the truncation doesn't use any shifted in (zero) top bits. - // Check the shift amount doesn't depend on the original load. + // Check the shift amount doesn't depend on the original load chain. if (KnownAmt.countMinTrailingZeros() >= 3 && KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - VT.getSizeInBits()) && - !Ld->isPredecessorOf(ShAmt.getNode())) { + none_of(Ld->uses(), [&ShAmt](SDUse &Use) { + return Use.getResNo() == 1 && + Use.getUser()->isPredecessorOf(ShAmt.getNode()); + })) { EVT PtrVT = Ld->getBasePtr().getValueType(); SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); SDValue PtrByteOfs = diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 400efa94789d3..7df6bf15f9fdd 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -248,6 +248,11 @@ static cl::opt "platforms that support this"), cl::Hidden, cl::init(true)); +static cl::opt + ClShadowAddrSpace("asan-shadow-addr-space", + cl::desc("Address space for pointers to the shadow map"), + cl::Hidden, cl::init(0)); + static cl::opt ClWithIfuncSuppressRemat( "asan-with-ifunc-suppress-remat", cl::desc("Suppress rematerialization of dynamic shadow address by passing " @@ -503,6 +508,7 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, bool IsAMDGPU = TargetTriple.isAMDGPU(); bool IsHaiku = TargetTriple.isOSHaiku(); bool IsWasm = TargetTriple.isWasm(); + bool IsBPF = TargetTriple.isBPF(); ShadowMapping Mapping; @@ -579,6 +585,8 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, else if (IsHaiku && IsX86_64) Mapping.Offset = (kSmallX86_64ShadowOffsetBase & (kSmallX86_64ShadowOffsetAlignMask << Mapping.Scale)); + else if (IsBPF) + Mapping.Offset = kDynamicShadowSentinel; else Mapping.Offset = kDefaultShadowOffset64; } @@ -1942,7 +1950,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Type *ShadowTy = IntegerType::get(*C, std::max(8U, TypeStoreSize >> Mapping.Scale)); - Type *ShadowPtrTy = PointerType::get(*C, 0); + Type *ShadowPtrTy = PointerType::get(*C, ClShadowAddrSpace); Value *ShadowPtr = memToShadow(AddrLong, IRB); const uint64_t ShadowAlign = std::max(Alignment.valueOrOne().value() >> Mapping.Scale, 1); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index f533a47150a7b..741392247c0d6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ 
b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -152,11 +152,12 @@ class VPBuilder { /// its underlying Instruction. VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, Instruction *Inst = nullptr, + const VPIRFlags &Flags = {}, const VPIRMetadata &MD = {}, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { VPInstruction *NewVPInst = tryInsertInstruction( - new VPInstruction(Opcode, Operands, {}, MD, DL, Name)); + new VPInstruction(Opcode, Operands, Flags, MD, DL, Name)); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } @@ -329,7 +330,7 @@ class VPBuilder { else if (Opcode == Instruction::ZExt) Flags = VPIRFlags::NonNegFlagsTy(false); return tryInsertInstruction( - new VPWidenCastRecipe(Opcode, Op, ResultTy, Flags)); + new VPWidenCastRecipe(Opcode, Op, ResultTy, nullptr, Flags)); } VPScalarIVStepsRecipe * diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 356d759b94799..c680b6fca84cd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7750,7 +7750,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI, }, Range); if (ShouldUseVectorIntrinsic) - return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, + return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, *VPI, VPI->getDebugLoc()); Function *Variant = nullptr; @@ -7804,7 +7804,8 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI, } Ops.push_back(VPI->getOperand(VPI->getNumOperands() - 1)); - return new VPWidenCallRecipe(CI, Variant, Ops, VPI->getDebugLoc()); + return new VPWidenCallRecipe(CI, Variant, Ops, *VPI, *VPI, + VPI->getDebugLoc()); } return nullptr; @@ -7842,7 +7843,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc()); Ops[1] = SafeRHS; - return new VPWidenRecipe(*I, Ops, *VPI, VPI->getDebugLoc()); + return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc()); } [[fallthrough]]; } @@ -7888,7 +7889,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { // For other binops, the legacy cost model only checks the second operand. 
NewOps[1] = GetConstantViaSCEV(NewOps[1]); } - return new VPWidenRecipe(*I, NewOps, *VPI, VPI->getDebugLoc()); + return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc()); } case Instruction::ExtractValue: { SmallVector NewOps(VPI->operands()); @@ -7896,7 +7897,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; NewOps.push_back(Plan.getConstantInt(32, Idx)); - return new VPWidenRecipe(*I, NewOps, *VPI, VPI->getDebugLoc()); + return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc()); } }; } @@ -7981,7 +7982,8 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI, (Range.Start.isScalable() && isa(I))) && "Should not predicate a uniform recipe"); auto *Recipe = - new VPReplicateRecipe(I, VPI->operands(), IsUniform, BlockInMask, *VPI); + new VPReplicateRecipe(I, VPI->operands(), IsUniform, BlockInMask, *VPI, + *VPI, VPI->getDebugLoc()); return Recipe; } @@ -8231,17 +8233,19 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, return nullptr; if (VPI->getOpcode() == Instruction::GetElementPtr) - return new VPWidenGEPRecipe(cast(Instr), R->operands()); + return new VPWidenGEPRecipe(cast(Instr), R->operands(), + *VPI, VPI->getDebugLoc()); if (VPI->getOpcode() == Instruction::Select) - return new VPWidenSelectRecipe(*cast(Instr), R->operands(), - *VPI); + return new VPWidenSelectRecipe(cast(Instr), R->operands(), *VPI, + *VPI, VPI->getDebugLoc()); if (Instruction::isCast(VPI->getOpcode())) { - auto *CastR = cast(R); auto *CI = cast(Instr); + auto *CastR = cast(VPI); return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0), - CastR->getResultType(), *CI, *VPI); + CastR->getResultType(), CI, *VPI, *VPI, + VPI->getDebugLoc()); } return tryToWiden(VPI); @@ -8269,8 +8273,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, SmallVector Ops; Ops.push_back(Plan.getOrAddLiveIn(Zero)); Ops.push_back(BinOp); - BinOp = new VPWidenRecipe(*ReductionI, Ops, VPIRMetadata(), - ReductionI->getDebugLoc()); + BinOp = new VPWidenRecipe(*ReductionI, Ops, VPIRFlags(*ReductionI), + VPIRMetadata(), ReductionI->getDebugLoc()); Builder.insert(BinOp->getDefiningRecipe()); ReductionOpcode = Instruction::Add; } @@ -8454,9 +8458,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { // Only create recipe for the final invariant store of the reduction. if (Legal->isInvariantStoreOfReduction(SI)) { + auto *VPI = cast(SingleDef); auto *Recipe = new VPReplicateRecipe( - SI, R.operands(), true /* IsUniform */, nullptr /*Mask*/, - *cast(SingleDef)); + SI, R.operands(), true /* IsUniform */, nullptr /*Mask*/, *VPI, + *VPI, VPI->getDebugLoc()); Recipe->insertBefore(*MiddleVPBB, MBIP); } R.eraseFromParent(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fc29ab0c84093..fedbcfb6bd32a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -882,14 +882,6 @@ class VPIRFlags { /// A pure-virtual common base class for recipes defining a single VPValue and /// using IR flags. 
struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { - VPRecipeWithIRFlags(const unsigned char SC, ArrayRef Operands, - DebugLoc DL = DebugLoc::getUnknown()) - : VPSingleDefRecipe(SC, Operands, DL), VPIRFlags() {} - - VPRecipeWithIRFlags(const unsigned char SC, ArrayRef Operands, - Instruction &I) - : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()), VPIRFlags(I) {} - VPRecipeWithIRFlags(const unsigned char SC, ArrayRef Operands, const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown()) @@ -1474,9 +1466,12 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, VPIRMetadata(Metadata), Opcode(Opcode) {} VPWidenRecipe(Instruction &I, ArrayRef Operands, - const VPIRMetadata &Metadata, DebugLoc DL) - : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), - VPIRMetadata(Metadata), Opcode(I.getOpcode()) {} + const VPIRFlags &Flags = {}, const VPIRMetadata &Metadata = {}, + DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL), + VPIRMetadata(Metadata), Opcode(I.getOpcode()) { + setUnderlyingValue(&I); + } ~VPWidenRecipe() override = default; @@ -1517,30 +1512,22 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - CastInst &UI, const VPIRMetadata &Metadata) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), - VPIRMetadata(Metadata), Opcode(Opcode), ResultTy(ResultTy) { - assert(UI.getOpcode() == Opcode && - "opcode of underlying cast doesn't match"); - } - VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - const VPIRFlags &Flags = {}, + CastInst *CI = nullptr, const VPIRFlags &Flags = {}, const VPIRMetadata &Metadata = {}, DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, Flags, DL), VPIRMetadata(Metadata), Opcode(Opcode), ResultTy(ResultTy) { assert(flagsValidForOpcode(Opcode) && "Set flags not supported for the provided opcode"); + setUnderlyingValue(CI); } ~VPWidenCastRecipe() override = default; VPWidenCastRecipe *clone() override { - auto *New = new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy, *this, - *this, getDebugLoc()); - if (auto *UV = getUnderlyingValue()) - New->setUnderlyingValue(UV); - return New; + return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy, + cast_or_null(getUnderlyingValue()), + *this, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenCastSC) @@ -1585,13 +1572,17 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID, ArrayRef CallArguments, Type *Ty, + const VPIRFlags &Flags = {}, const VPIRMetadata &MD = {}, DebugLoc DL = DebugLoc::getUnknown()) - : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI), + : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, Flags, + DL), VPIRMetadata(MD), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), MayReadFromMemory(CI.mayReadFromMemory()), MayWriteToMemory(CI.mayWriteToMemory()), - MayHaveSideEffects(CI.mayHaveSideEffects()) {} + MayHaveSideEffects(CI.mayHaveSideEffects()) { + setUnderlyingValue(&CI); + } VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, ArrayRef CallArguments, Type *Ty, @@ -1617,7 +1608,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenIntrinsicRecipe *clone() override { if (Value *CI = getUnderlyingValue()) return new VPWidenIntrinsicRecipe(*cast(CI), 
VectorIntrinsicID, - operands(), ResultTy, *this, + operands(), ResultTy, *this, *this, getDebugLoc()); return new VPWidenIntrinsicRecipe(VectorIntrinsicID, operands(), ResultTy, *this, *this, getDebugLoc()); @@ -1671,10 +1662,11 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags, public: VPWidenCallRecipe(Value *UV, Function *Variant, ArrayRef CallArguments, - DebugLoc DL = DebugLoc::getUnknown()) - : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments, - *cast(UV)), - VPIRMetadata(*cast(UV)), Variant(Variant) { + const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}, DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments, Flags, DL), + VPIRMetadata(Metadata), Variant(Variant) { + setUnderlyingValue(UV); assert( isa(getOperand(getNumOperands() - 1)->getLiveInIRValue()) && "last operand must be the called function"); @@ -1684,7 +1676,7 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags, VPWidenCallRecipe *clone() override { return new VPWidenCallRecipe(getUnderlyingValue(), Variant, operands(), - getDebugLoc()); + *this, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenCallSC) @@ -1761,16 +1753,19 @@ class VPHistogramRecipe : public VPRecipeBase { /// instruction. struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { - VPWidenSelectRecipe(SelectInst &I, ArrayRef Operands, - const VPIRMetadata &MD = {}) - : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I), - VPIRMetadata(MD) {} + VPWidenSelectRecipe(SelectInst *SI, ArrayRef Operands, + const VPIRFlags &Flags = {}, const VPIRMetadata &MD = {}, + DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, Flags, DL), + VPIRMetadata(MD) { + setUnderlyingValue(SI); + } ~VPWidenSelectRecipe() override = default; VPWidenSelectRecipe *clone() override { - return new VPWidenSelectRecipe(*cast(getUnderlyingInstr()), - operands(), *this); + return new VPWidenSelectRecipe(cast(getUnderlyingInstr()), + operands(), *this, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC) @@ -1822,9 +1817,12 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { } public: - VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP), + VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef Operands, + const VPIRFlags &Flags = {}, + DebugLoc DL = DebugLoc::getUnknown()) + : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, Flags, DL), SourceElementTy(GEP->getSourceElementType()) { + setUnderlyingValue(GEP); SmallVector> Metadata; (void)Metadata; getMetadataToPropagate(GEP, Metadata); @@ -1835,7 +1833,7 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { VPWidenGEPRecipe *clone() override { return new VPWidenGEPRecipe(cast(getUnderlyingInstr()), - operands()); + operands(), *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC) @@ -2929,10 +2927,12 @@ class LLVM_ABI_FOR_TEST VPReplicateRecipe : public VPRecipeWithIRFlags, public: VPReplicateRecipe(Instruction *I, ArrayRef Operands, bool IsSingleScalar, VPValue *Mask = nullptr, - VPIRMetadata Metadata = {}) - : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I), + const VPIRFlags &Flags = {}, VPIRMetadata Metadata = {}, + DebugLoc DL = DebugLoc::getUnknown()) + : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, Flags, DL), VPIRMetadata(Metadata), IsSingleScalar(IsSingleScalar), IsPredicated(Mask) { + setUnderlyingValue(I); if (Mask) 
addOperand(Mask); } @@ -2940,9 +2940,9 @@ class LLVM_ABI_FOR_TEST VPReplicateRecipe : public VPRecipeWithIRFlags, ~VPReplicateRecipe() override = default; VPReplicateRecipe *clone() override { - auto *Copy = - new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsSingleScalar, - isPredicated() ? getMask() : nullptr, *this); + auto *Copy = new VPReplicateRecipe( + getUnderlyingInstr(), operands(), IsSingleScalar, + isPredicated() ? getMask() : nullptr, *this, *this, getDebugLoc()); Copy->transferFlags(*this); return Copy; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 612202d049774..dbbde1cafa9f2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -190,7 +190,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, // recipes. if (Br->isConditional()) { VPValue *Cond = getOrCreateVPOperand(Br->getCondition()); - VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst, + VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst, {}, VPIRMetadata(*Inst), Inst->getDebugLoc()); } @@ -205,7 +205,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, SmallVector Ops = {getOrCreateVPOperand(SI->getCondition())}; for (auto Case : SI->cases()) Ops.push_back(getOrCreateVPOperand(Case.getCaseValue())); - VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst, + VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst, {}, VPIRMetadata(*Inst), Inst->getDebugLoc()); continue; } @@ -255,13 +255,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, if (auto *CI = dyn_cast(Inst)) { NewR = VPIRBuilder.createScalarCast(CI->getOpcode(), VPOperands[0], CI->getType(), CI->getDebugLoc(), - {}, MD); + VPIRFlags(*CI), MD); NewR->setUnderlyingValue(CI); } else { // Build VPInstruction for any arbitrary Instruction without specific // representation in VPlan. 
- NewR = VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst, MD, - Inst->getDebugLoc()); + NewR = + VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst, + VPIRFlags(*Inst), MD, Inst->getDebugLoc()); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index fca6554ad77c6..ef36e29aaa5c4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2056,24 +2056,26 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { switch (OpType) { case OperationType::OverflowingBinOp: return Opcode == Instruction::Add || Opcode == Instruction::Sub || - Opcode == Instruction::Mul || + Opcode == Instruction::Mul || Opcode == Instruction::Shl || Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart; case OperationType::Trunc: return Opcode == Instruction::Trunc; case OperationType::DisjointOp: return Opcode == Instruction::Or; case OperationType::PossiblyExactOp: - return Opcode == Instruction::AShr; + return Opcode == Instruction::AShr || Opcode == Instruction::LShr || + Opcode == Instruction::UDiv || Opcode == Instruction::SDiv; case OperationType::GEPOp: return Opcode == Instruction::GetElementPtr || Opcode == VPInstruction::PtrAdd || Opcode == VPInstruction::WidePtrAdd; case OperationType::FPMathOp: - return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || - Opcode == Instruction::FSub || Opcode == Instruction::FNeg || - Opcode == Instruction::FDiv || Opcode == Instruction::FRem || - Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc || - Opcode == Instruction::FCmp || Opcode == Instruction::Select || + return Opcode == Instruction::Call || Opcode == Instruction::FAdd || + Opcode == Instruction::FMul || Opcode == Instruction::FSub || + Opcode == Instruction::FNeg || Opcode == Instruction::FDiv || + Opcode == Instruction::FRem || Opcode == Instruction::FPExt || + Opcode == Instruction::FPTrunc || Opcode == Instruction::FCmp || + Opcode == Instruction::Select || Opcode == VPInstruction::WideIVStep || Opcode == VPInstruction::ReductionStartVector || Opcode == VPInstruction::ComputeReductionResult; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 26563242de283..25557f1d5d651 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -104,24 +104,26 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI, Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { - NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); + NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI, + Ingredient.getDebugLoc()); } else if (CallInst *CI = dyn_cast(Inst)) { Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI); if (VectorID == Intrinsic::not_intrinsic) return false; NewRecipe = new VPWidenIntrinsicRecipe( *CI, getVectorIntrinsicIDForCall(CI, &TLI), - drop_end(Ingredient.operands()), CI->getType(), *VPI, - CI->getDebugLoc()); + drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI), + *VPI, CI->getDebugLoc()); } else if (SelectInst *SI = dyn_cast(Inst)) { - NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands(), *VPI); + NewRecipe = new VPWidenSelectRecipe(SI, Ingredient.operands(), *VPI, + *VPI, Ingredient.getDebugLoc()); } else if (auto *CI = dyn_cast(Inst)) { - 
NewRecipe = - new VPWidenCastRecipe(CI->getOpcode(), Ingredient.getOperand(0), - CI->getType(), *CI, *VPI); + NewRecipe = new VPWidenCastRecipe( + CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI, + VPIRFlags(*CI), VPIRMetadata(*CI)); } else { NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI, - Ingredient.getDebugLoc()); + *VPI, Ingredient.getDebugLoc()); } } @@ -226,7 +228,8 @@ static bool sinkScalarOperands(VPlan &Plan) { // then cloning should be sufficient here. Instruction *I = SinkCandidate->getUnderlyingInstr(); Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true, - nullptr /*Mask*/, *SinkCandidateRepR); + nullptr /*Mask*/, *SinkCandidateRepR, + *SinkCandidateRepR); // TODO: add ".cloned" suffix to name of Clone's VPValue. } else { Clone = SinkCandidate->clone(); @@ -385,7 +388,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, // mask but in the replicate region. auto *RecipeWithoutMask = new VPReplicateRecipe( PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()), - PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe); + PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe, + PredRecipe->getDebugLoc()); auto *Pred = Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); @@ -691,7 +695,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // analysis. auto Users = collectUsersRecursively(PhiR); for (VPUser *U : reverse(Users)) { - auto *Def = dyn_cast(U); + auto *Def = dyn_cast(U); auto *RepR = dyn_cast(U); // Skip recipes that shouldn't be narrowed. if (!Def || !isa(Def) || @@ -704,7 +708,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { continue; auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(), - Def->operands(), /*IsUniform*/ true); + Def->operands(), /*IsUniform*/ true, + /*Mask*/ nullptr, /*Flags*/ *Def); Clone->insertAfter(Def); Def->replaceAllUsesWith(Clone); } @@ -1423,12 +1428,13 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { if (RepR && (RepR->isSingleScalar() || RepR->isPredicated())) continue; - auto *RepOrWidenR = cast(&R); + auto *RepOrWidenR = cast(&R); if (RepR && isa(RepR->getUnderlyingInstr()) && vputils::isSingleScalar(RepR->getOperand(1))) { auto *Clone = new VPReplicateRecipe( RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(), - true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/); + true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/, + *RepR /*Metadata*/, RepR->getDebugLoc()); Clone->insertBefore(RepOrWidenR); unsigned ExtractOpc = vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)) @@ -1469,9 +1475,9 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { })) continue; - auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(), - RepOrWidenR->operands(), - true /*IsSingleScalar*/); + auto *Clone = new VPReplicateRecipe( + RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(), + true /*IsSingleScalar*/, nullptr, *RepOrWidenR); Clone->insertBefore(RepOrWidenR); RepOrWidenR->replaceAllUsesWith(Clone); if (isDeadRecipe(*RepOrWidenR)) @@ -3824,15 +3830,15 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Ext0->getOpcode() == Ext1->getOpcode() && IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) { auto *NewExt0 = new VPWidenCastRecipe( - Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0, - *Ext0, Ext0->getDebugLoc()); + Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr, + 
*Ext0, *Ext0, Ext0->getDebugLoc()); NewExt0->insertBefore(Ext0); VPWidenCastRecipe *NewExt1 = NewExt0; if (Ext0 != Ext1) { NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0), - Ext->getResultType(), *Ext1, *Ext1, - Ext1->getDebugLoc()); + Ext->getResultType(), nullptr, *Ext1, + *Ext1, Ext1->getDebugLoc()); NewExt1->insertBefore(Ext1); } Mul->setOperand(0, NewExt0); @@ -4353,7 +4359,7 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl &NarrowedOps) { // process one original iteration. auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, /*IsUniform*/ true, - /*Mask*/ nullptr, *WideLoad); + /*Mask*/ nullptr, {}, *WideLoad); N->insertBefore(WideLoad); NarrowedOps.insert(N); return N; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index d4b8b72beb942..d76d2ed5f1c76 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -518,9 +518,9 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, // TODO: have cloning of replicate recipes also provide the desired result // coupled with setting its operands to NewOps (deriving IsSingleScalar and // Mask from the operands?) - New = - new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, - /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR); + New = new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, + /*IsSingleScalar=*/true, /*Mask=*/nullptr, + *RepR, *RepR, RepR->getDebugLoc()); } else { assert(isa(DefR) && "DefR must be a VPReplicateRecipe or VPInstruction"); diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll index 9e1c0c1b115ab..12ae241dda4bd 100644 --- a/llvm/test/CodeGen/AArch64/i128-math.ll +++ b/llvm/test/CodeGen/AArch64/i128-math.ll @@ -262,20 +262,28 @@ define i128 @u128_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_checked_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB17_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo ; CHECK-NEXT: eor w2, w8, #0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB17_2: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: eor w2, w8, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -290,19 +298,27 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_overflowing_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB18_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; 
CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w2, w8, wzr, lo +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB18_2: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: mov w2, wzr ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -316,19 +332,28 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { define i128 @u128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_saturating_mul: ; CHECK: // %bb.0: -; CHECK-NEXT: mul x9, x3, x0 +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB19_2 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: mul x8, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 -; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x9, x3, x0 +; CHECK-NEXT: madd x11, x1, x2, x8 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x12, x0, x2 +; CHECK-NEXT: ccmp xzr, x9, #0, eq ; CHECK-NEXT: mul x8, x0, x2 ; CHECK-NEXT: cset w10, ne -; CHECK-NEXT: adds x9, x11, x9 +; CHECK-NEXT: adds x9, x12, x11 ; CHECK-NEXT: csinc w10, w10, wzr, lo +; CHECK-NEXT: b .LBB19_3 +; CHECK-NEXT: .LBB19_2: // %overflow.no +; CHECK-NEXT: umulh x9, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: .LBB19_3: // %overflow.res ; CHECK-NEXT: cmp w10, #0 ; CHECK-NEXT: csinv x0, x8, xzr, eq ; CHECK-NEXT: csinv x1, x9, xzr, eq @@ -355,6 +380,11 @@ define i128 @i128_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_checked_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB21_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -364,24 +394,30 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x9, x8, x9 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x8, x14, x10 -; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x8 -; CHECK-NEXT: adc x11, x12, x13 -; CHECK-NEXT: asr x12, x9, #63 -; CHECK-NEXT: asr x13, x11, #63 -; CHECK-NEXT: adds x9, x9, x11 ; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adc x12, x12, x13 -; CHECK-NEXT: adds x9, x15, x9 -; CHECK-NEXT: adc x10, x10, x12 -; CHECK-NEXT: cmp x9, x11 -; CHECK-NEXT: ccmp x10, x11, #0, eq -; CHECK-NEXT: cset w2, eq +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: eor w2, w8, #0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB21_2: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 
+; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: eor w2, w8, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -396,6 +432,11 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_overflowing_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB22_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -405,24 +446,29 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x9, x8, x9 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x8, x14, x10 -; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x8 -; CHECK-NEXT: adc x11, x12, x13 -; CHECK-NEXT: asr x12, x9, #63 -; CHECK-NEXT: asr x13, x11, #63 -; CHECK-NEXT: adds x9, x9, x11 ; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adc x12, x12, x13 -; CHECK-NEXT: adds x9, x15, x9 -; CHECK-NEXT: adc x10, x10, x12 -; CHECK-NEXT: cmp x9, x11 -; CHECK-NEXT: ccmp x10, x11, #0, eq +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq ; CHECK-NEXT: cset w2, ne +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB22_2: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: mov w2, wzr ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -436,6 +482,11 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_saturating_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB23_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -445,29 +496,35 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: adc x9, x8, x9 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x9, x14, x10 -; CHECK-NEXT: mul x11, x1, x3 -; CHECK-NEXT: adc x10, x12, x13 -; CHECK-NEXT: smulh x12, x1, x3 -; CHECK-NEXT: asr x13, x8, #63 -; CHECK-NEXT: asr x14, x10, #63 -; CHECK-NEXT: adds x8, x8, x10 -; CHECK-NEXT: adc x10, x13, x14 -; CHECK-NEXT: adds x8, x11, x8 -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: mul x13, x0, x2 -; CHECK-NEXT: adc x10, x12, x10 -; CHECK-NEXT: eor x12, x3, x1 -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: eor x10, x10, x11 -; CHECK-NEXT: asr x11, x12, #63 -; CHECK-NEXT: orr x8, x8, x10 -; CHECK-NEXT: eor x10, x11, #0x7fffffffffffffff -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: csinv x0, x13, x11, eq -; CHECK-NEXT: csel x1, x10, x9, ne +; 
CHECK-NEXT: adds x8, x14, x10 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: asr x14, x8, #63 +; CHECK-NEXT: smulh x10, x1, x3 +; CHECK-NEXT: adc x11, x12, x13 +; CHECK-NEXT: asr x12, x9, #63 +; CHECK-NEXT: asr x13, x11, #63 +; CHECK-NEXT: adds x11, x9, x11 +; CHECK-NEXT: mul x9, x0, x2 +; CHECK-NEXT: adc x12, x12, x13 +; CHECK-NEXT: adds x11, x15, x11 +; CHECK-NEXT: adc x10, x10, x12 +; CHECK-NEXT: cmp x11, x14 +; CHECK-NEXT: ccmp x10, x14, #0, eq +; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: b .LBB23_3 +; CHECK-NEXT: .LBB23_2: // %overflow.no +; CHECK-NEXT: smulh x8, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x9, x0, x2 +; CHECK-NEXT: .LBB23_3: // %overflow.res +; CHECK-NEXT: eor x11, x3, x1 +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: asr x11, x11, #63 +; CHECK-NEXT: eor x12, x11, #0x7fffffffffffffff +; CHECK-NEXT: csinv x0, x9, x11, eq +; CHECK-NEXT: csel x1, x12, x8, ne ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll index 9924b7c63f763..3d90e094a5747 100644 --- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll +++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll @@ -224,21 +224,29 @@ cleanup: define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_umul_i128: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq +; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: b.ne .LBB4_2 -; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: cbnz w8, .LBB4_3 +; CHECK-NEXT: b .LBB4_4 +; CHECK-NEXT: .LBB4_2: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: cbz w8, .LBB4_4 +; CHECK-NEXT: .LBB4_3: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -247,9 +255,7 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB4_2: // %if.end -; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: .LBB4_4: // %cleanup ; CHECK-NEXT: ret entry: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) @@ -273,34 +279,40 @@ cleanup: define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_smul_i128: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: asr x10, x1, #63 -; CHECK-NEXT: umulh x11, x0, x2 -; CHECK-NEXT: asr x14, x3, #63 -; CHECK-NEXT: mov x8, x1 -; CHECK-NEXT: mul x12, x1, x2 -; CHECK-NEXT: umulh x9, x1, x2 -; CHECK-NEXT: mul x10, x10, x2 -; CHECK-NEXT: adds x11, x12, x11 -; CHECK-NEXT: mul x15, x0, x3 -; CHECK-NEXT: umulh x13, x0, x3 -; CHECK-NEXT: adc x9, x9, x10 -; CHECK-NEXT: mul x14, x0, x14 -; CHECK-NEXT: mul x16, x1, x3 -; CHECK-NEXT: adds x1, x15, x11 -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: smulh x8, x8, x3 -; CHECK-NEXT: adc x10, x13, x14 -; CHECK-NEXT: asr x12, x10, #63 -; CHECK-NEXT: adds x9, x9, x10 -; CHECK-NEXT: adc x10, x11, x12 -; CHECK-NEXT: adds x9, x16, x9 -; CHECK-NEXT: asr x11, x1, #63 -; CHECK-NEXT: adc x8, x8, x10 -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: eor x9, x9, x11 +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 ; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cbz x8, .LBB5_2 -; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: cbz x8, .LBB5_4 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: umulh x10, x0, x2 +; CHECK-NEXT: asr x13, x3, #63 +; CHECK-NEXT: mul x11, x1, x2 +; CHECK-NEXT: umulh x8, x1, x2 +; CHECK-NEXT: mul x9, x9, x2 +; CHECK-NEXT: adds x10, x11, x10 +; CHECK-NEXT: mul x14, x0, x3 +; CHECK-NEXT: umulh x12, x0, x3 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 +; CHECK-NEXT: mul x13, x0, x13 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: cbz w8, .LBB5_3 +; CHECK-NEXT: .LBB5_2: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -309,10 +321,13 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .LBB5_3: // %cleanup ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB5_2: // %if.end +; CHECK-NEXT: .LBB5_4: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: ret +; CHECK-NEXT: cbnz w8, .LBB5_2 +; CHECK-NEXT: b .LBB5_3 entry: %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %1 = extractvalue { i128, i1 } %0, 1 diff --git a/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll b/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll new file mode 100644 index 0000000000000..7b60f81539aa8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll @@ -0,0 +1,261 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s + + +declare i32 @error() + +define i128 @test1(i128 noundef %x, i128 noundef %y) { +; CHECK-LABEL: test1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB0_4 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: umulh x10, x0, x2 +; CHECK-NEXT: asr x13, x3, #63 +; CHECK-NEXT: mul x11, x1, x2 +; CHECK-NEXT: umulh x8, x1, x2 +; CHECK-NEXT: mul x9, x9, x2 +; CHECK-NEXT: adds x10, x11, x10 +; CHECK-NEXT: mul x14, x0, x3 +; CHECK-NEXT: umulh x12, x0, x3 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 +; CHECK-NEXT: mul x13, x0, x13 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: cbz w8, .LBB0_3 +; CHECK-NEXT: .LBB0_2: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl error +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x0, w0 +; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .LBB0_3: // %cleanup +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_4: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: cbnz w8, .LBB0_2 +; CHECK-NEXT: b .LBB0_3 +entry: + %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) + %1 = extractvalue { i128, i1 } %0, 1 + br i1 %1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @error() + %conv1 = sext i32 %call to i128 + br label %cleanup + +if.end: + %2 = extractvalue { i128, i1 } %0, 0 + br label %cleanup + +cleanup: + %retval.0 = phi i128 [ %conv1, %if.then ], [ %2, %if.end ] + ret i128 %retval.0 +} + +define i128 @test2(i128 noundef %x, i128 noundef %y, ptr %out) { +; CHECK-LABEL: test2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB1_4 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: umulh x10, x0, x2 +; CHECK-NEXT: asr x13, x3, #63 +; CHECK-NEXT: mul x11, x1, x2 +; CHECK-NEXT: umulh x8, x1, x2 +; CHECK-NEXT: mul x9, x9, x2 +; CHECK-NEXT: adds x10, x11, x10 +; CHECK-NEXT: mul x14, x0, x3 +; CHECK-NEXT: umulh x12, x0, x3 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 +; CHECK-NEXT: mul x13, x0, x13 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: stp x0, x1, [x4] +; CHECK-NEXT: cbz w8, .LBB1_3 +; CHECK-NEXT: .LBB1_2: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl error +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x0, w0 +; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .LBB1_3: // %cleanup +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_4: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: stp x0, x1, [x4] +; CHECK-NEXT: cbnz w8, .LBB1_2 +; CHECK-NEXT: b .LBB1_3 +entry: + %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) + %1 = extractvalue { i128, i1 } %0, 0 + store i128 %1, ptr %out + %2 = extractvalue { i128, i1 } %0, 1 + br i1 %2, label %if.then, label %cleanup + +if.then: + %call = tail call i32 @error() + %conv1 = sext i32 %call to i128 + br label %cleanup + +cleanup: + %retval.0 = phi i128 [ %conv1, %if.then ], [ %1, %entry ] + ret i128 %retval.0 +} + +define i128 @test3(i128 noundef %x, i128 noundef %y, ptr %out) { +; CHECK-LABEL: test3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB2_3 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: mul x8, x3, x0 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: ccmp x3, #0, #4, ne +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x9, x3, x0 +; CHECK-NEXT: madd x11, x1, x2, x8 +; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x12, x0, x2 +; CHECK-NEXT: ccmp xzr, x9, #0, eq +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: adds x9, x12, x11 +; CHECK-NEXT: csinc w10, w10, wzr, lo +; CHECK-NEXT: stp x8, x9, [x4] +; CHECK-NEXT: cbnz w10, .LBB2_4 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: mov x1, xzr +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_3: // %overflow.no +; CHECK-NEXT: umulh x9, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: stp x8, x9, [x4] +; CHECK-NEXT: cbz w10, .LBB2_2 +; CHECK-NEXT: .LBB2_4: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl error +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x0, w0 +; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) + %1 = extractvalue { i128, i1 } %0, 0 + store i128 %1, ptr %out + %2 = extractvalue { i128, i1 } %0, 1 + br i1 %2, label %if.then, label %cleanup + +if.then: + %call = tail call i32 @error() + %conv1 = sext i32 %call to i128 + br label %cleanup + +cleanup: + %retval.0 = phi i128 [ %conv1, %if.then ], [ 1, %entry ] + ret i128 %retval.0 +} + +define i128 @test4(i128 noundef %x, i128 noundef %y, i128 %out) { +; CHECK-LABEL: test4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB3_2 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: mul x8, x3, x0 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: ccmp x3, #0, #4, ne +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x9, x3, x0 +; CHECK-NEXT: madd x11, x1, x2, x8 +; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x12, x0, x2 +; CHECK-NEXT: ccmp xzr, x9, #0, eq +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: adds x9, x12, x11 +; CHECK-NEXT: csinc w10, w10, wzr, lo +; CHECK-NEXT: b .LBB3_3 +; CHECK-NEXT: .LBB3_2: // %overflow.no +; CHECK-NEXT: umulh x9, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: .LBB3_3: // %overflow.res +; CHECK-NEXT: adds x0, x8, x4 +; CHECK-NEXT: adc x1, x9, x5 +; CHECK-NEXT: cbz w10, .LBB3_5 +; CHECK-NEXT: // %bb.4: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl error +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x0, w0 +; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .LBB3_5: // %cleanup +; CHECK-NEXT: ret +entry: + %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) + %1 = extractvalue { i128, i1 } %0, 0 + %res = add i128 %1, %out + %2 = extractvalue { i128, i1 } %0, 1 + br i1 %2, label %if.then, label %cleanup + +if.then: + %call = tail call i32 @error() + %conv1 = sext i32 %call to i128 + br label %cleanup + +cleanup: + %retval.0 = phi i128 [ %conv1, %if.then ], [ %res, %entry ] + ret i128 %retval.0 +} diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll index 16e8feb0dc5bb..fc3e018f2ec7a 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll @@ -632,7 +632,6 @@ define @fsub_sel_fmul_negzero_nxv8bf16( @fsub_sel_fmul_negzero_nxv8bf16( @fsub_sel_fmul_negzero_nxv8bf16( @fadd_sel_fmul_d_negzero( %a, define @fsub_sel_fmul_h_negzero( %a, %b, %c, %mask) { ; CHECK-LABEL: fsub_sel_fmul_h_negzero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32768 // =0x8000 +; CHECK-NEXT: dupm z3.h, #0x8000 ; CHECK-NEXT: fmul z1.h, z1.h, z2.h -; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: sel z1.h, p0, z1.h, z2.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h ; CHECK-NEXT: fsub z0.h, z0.h, z1.h ; CHECK-NEXT: ret %fmul = fmul %b, %c @@ -1150,10 +1149,9 @@ define @fsub_sel_fmul_h_negzero( %a, @fsub_sel_fmul_s_negzero( %a, %b, %c, %mask) { ; CHECK-LABEL: fsub_sel_fmul_s_negzero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000 
+; CHECK-NEXT: mov z3.s, #0x80000000 ; CHECK-NEXT: fmul z1.s, z1.s, z2.s -; CHECK-NEXT: mov z2.s, w8 -; CHECK-NEXT: sel z1.s, p0, z1.s, z2.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s ; CHECK-NEXT: fsub z0.s, z0.s, z1.s ; CHECK-NEXT: ret %fmul = fmul %b, %c @@ -1166,10 +1164,9 @@ define @fsub_sel_fmul_s_negzero( %a, @fsub_sel_fmul_d_negzero( %a, %b, %c, %mask) { ; CHECK-LABEL: fsub_sel_fmul_d_negzero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 +; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: fmul z1.d, z1.d, z2.d -; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d ; CHECK-NEXT: fsub z0.d, z0.d, z1.d ; CHECK-NEXT: ret %fmul = fmul %b, %c diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll index 8750867c56731..1223ae1c0cbdd 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll @@ -51,10 +51,9 @@ define half @fadda_nxv6f16( %v, half %s) { ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: mov w8, #32768 // =0x8000 +; CHECK-NEXT: dupm z2.h, #0x8000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: str z0, [sp] -; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fmov s0, s1 ; CHECK-NEXT: st1h { z2.d }, p0, [sp, #3, mul vl] ; CHECK-NEXT: ptrue p0.h @@ -77,12 +76,11 @@ define half @fadda_nxv10f16( %v, half %s) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 -; CHECK-NEXT: mov w8, #32768 // =0x8000 ; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: addvl x8, sp, #1 ; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: fadda h2, p0, h2, z0.h -; CHECK-NEXT: mov z0.h, w8 -; CHECK-NEXT: addvl x8, sp, #1 +; CHECK-NEXT: dupm z0.h, #0x8000 ; CHECK-NEXT: st1h { z0.d }, p1, [sp, #1, mul vl] ; CHECK-NEXT: ldr z1, [sp] ; CHECK-NEXT: str z1, [sp, #1, mul vl] @@ -105,11 +103,10 @@ define half @fadda_nxv12f16( %v, half %s) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 -; CHECK-NEXT: mov w8, #32768 // =0x8000 +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: fadda h2, p0, h2, z0.h -; CHECK-NEXT: uunpklo z0.s, z1.h -; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: dupm z0.h, #0x8000 +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h ; CHECK-NEXT: fadda h2, p0, h2, z0.h ; CHECK-NEXT: fmov s0, s2 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll index 4ae7ac7b292e9..897ade00320db 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll @@ -454,18 +454,17 @@ declare @llvm.fptosi.sat.nxv4f16.nxv4i64() define @test_signed_v2f16_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: mov z1.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z1.d, #0xffffffff80000000 -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.d, #0x7fffffff +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: fcvtzs z2.d, p1/m, z0.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.d, 
#0x7fffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.h -; CHECK-NEXT: sel z0.d, p2, z2.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv2f16.nxv2i32( %f) @@ -475,18 +474,17 @@ define @test_signed_v2f16_v2i32( %f) { define @test_signed_v4f16_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: mov z1.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: mov z2.s, #0x80000000 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z1.s, #0x80000000 -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.s, #0x7fffffff +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: fcvtzs z2.s, p1/m, z0.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.s, #0x7fffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: fcvtzs z1.s, p1/m, z0.h -; CHECK-NEXT: sel z0.s, p2, z2.s, z1.s +; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f16.nxv4i32( %f) @@ -496,26 +494,25 @@ define @test_signed_v4f16_v4i32( %f) { define @test_signed_v8f16_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: mov z1.h, #-1025 // =0xfffffffffffffbff +; CHECK-NEXT: uunpklo z2.s, z0.h +; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: uunpkhi z0.s, z0.h -; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: mov z3.s, #0x80000000 ; CHECK-NEXT: mov z4.s, #0x80000000 ; CHECK-NEXT: mov z5.h, w8 -; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h -; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.s, #0x7fffffff +; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, z1.h +; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.s, #0x7fffffff ; CHECK-NEXT: fcmgt p3.h, p0/z, z0.h, z5.h -; CHECK-NEXT: fcvtzs z3.s, p1/m, z1.h -; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z5.h +; CHECK-NEXT: fcvtzs z3.s, p1/m, z2.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z2.h, z5.h ; CHECK-NEXT: fcvtzs z4.s, p2/m, z0.h -; CHECK-NEXT: fcmuo p2.h, p0/z, z1.h, z1.h +; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z2.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z0.s, p1, z2.s, z3.s -; CHECK-NEXT: sel z1.s, p3, z2.s, z4.s +; CHECK-NEXT: sel z0.s, p1, z1.s, z3.s +; CHECK-NEXT: sel z1.s, p3, z1.s, z4.s ; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 ; CHECK-NEXT: ret @@ -526,18 +523,17 @@ define @test_signed_v8f16_v8i32( %f) { define @test_signed_v4f16_v4i16( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #63488 // =0xf800 +; CHECK-NEXT: dupm z1.h, #0xf800 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z2.s, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z1.s, #32767 // =0x7fff -; CHECK-NEXT: fcvtzs z2.s, p1/m, z0.h +; CHECK-NEXT: mov z1.s, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: fcvtzs z1.s, p1/m, z0.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z2.h +; CHECK-NEXT: mov z2.s, #32767 // =0x7fff ; CHECK-NEXT: 
fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z0.s, p2, z1.s, z2.s +; CHECK-NEXT: sel z0.s, p1, z2.s, z1.s ; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f16.nxv4i16( %f) @@ -547,18 +543,17 @@ define @test_signed_v4f16_v4i16( %f) { define @test_signed_v8f16_v8i16( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #63488 // =0xf800 +; CHECK-NEXT: dupm z1.h, #0xf800 ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z2.h, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z1.h, #32767 // =0x7fff -; CHECK-NEXT: fcvtzs z2.h, p1/m, z0.h +; CHECK-NEXT: mov z1.h, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: fcvtzs z1.h, p1/m, z0.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z2.h +; CHECK-NEXT: mov z2.h, #32767 // =0x7fff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z0.h, p2, z1.h, z2.h +; CHECK-NEXT: sel z0.h, p1, z2.h, z1.h ; CHECK-NEXT: mov z0.h, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv8f16.nxv8i16( %f) @@ -568,18 +563,17 @@ define @test_signed_v8f16_v8i16( %f) { define @test_signed_v2f16_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: mov z1.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z1.d, #0x8000000000000000 -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: fcvtzs z2.d, p1/m, z0.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.h -; CHECK-NEXT: sel z0.d, p2, z2.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv2f16.nxv2i64( %f) @@ -589,26 +583,25 @@ define @test_signed_v2f16_v2i64( %f) { define @test_signed_v4f16_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: mov z1.h, #-1025 // =0xfffffffffffffbff +; CHECK-NEXT: uunpklo z2.d, z0.s +; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: mov z4.d, #0x8000000000000000 ; CHECK-NEXT: mov z5.h, w8 -; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h -; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, z1.h +; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmgt p3.h, p0/z, z0.h, z5.h -; CHECK-NEXT: fcvtzs z3.d, p1/m, z1.h -; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z5.h +; CHECK-NEXT: fcvtzs z3.d, p1/m, z2.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z2.h, z5.h ; CHECK-NEXT: fcvtzs z4.d, p2/m, z0.h -; CHECK-NEXT: fcmuo p2.h, p0/z, z1.h, z1.h +; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z2.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d -; 
CHECK-NEXT: sel z1.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z0.d, p1, z1.d, z3.d +; CHECK-NEXT: sel z1.d, p3, z1.d, z4.d ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-llrint.ll b/llvm/test/CodeGen/AArch64/sve-llrint.ll index f964d70e0a05c..c2bb0c81ab405 100644 --- a/llvm/test/CodeGen/AArch64/sve-llrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-llrint.ll @@ -5,9 +5,8 @@ define @llrint_v1i64_v1f16( %x) { ; CHECK-LABEL: llrint_v1i64_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: mov z1.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h @@ -28,9 +27,8 @@ define @llrint_v1i64_v2f16( %x) { ; CHECK-LABEL: llrint_v1i64_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: mov z1.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h @@ -52,10 +50,9 @@ define @llrint_v4i64_v4f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z2.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: mov z4.d, #0x8000000000000000 ; CHECK-NEXT: mov z5.d, #0x7fffffffffffffff @@ -92,10 +89,9 @@ define @llrint_v8i64_v8f16( %x) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z0.s, z0.h -; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z4.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: mov z5.d, #0x8000000000000000 ; CHECK-NEXT: mov z6.d, #0x8000000000000000 ; CHECK-NEXT: mov z7.d, #0x8000000000000000 @@ -162,12 +158,13 @@ define @llrint_v16i64_v16f16( %x) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z3.s, z0.h -; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: uunpklo z7.s, z1.h ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z0.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: mov z0.d, #0x8000000000000000 ; CHECK-NEXT: mov z5.d, #0x8000000000000000 +; CHECK-NEXT: mov z29.h, w8 ; CHECK-NEXT: mov z31.d, #0x8000000000000000 ; CHECK-NEXT: uunpklo z4.d, z2.s ; CHECK-NEXT: uunpklo z24.d, z3.s @@ -175,10 +172,8 @@ define @llrint_v16i64_v16f16( %x) { ; CHECK-NEXT: uunpkhi z6.d, z2.s ; CHECK-NEXT: uunpklo z26.d, z7.s ; CHECK-NEXT: uunpkhi z7.d, z7.s -; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: uunpklo z30.d, z1.s -; CHECK-NEXT: mov z29.h, w8 ; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: movprfx z27, z4 @@ -191,17 +186,17 @@ define @llrint_v16i64_v16f16( %x) { ; CHECK-NEXT: frintx z26.h, p0/m, z26.h ; CHECK-NEXT: frintx z7.h, p0/m, z7.h ; CHECK-NEXT: mov z6.d, #0x8000000000000000 -; CHECK-NEXT: fcmge p1.h, p0/z, z27.h, z2.h 
-; CHECK-NEXT: fcmge p3.h, p0/z, z24.h, z2.h -; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z2.h -; CHECK-NEXT: fcmge p2.h, p0/z, z28.h, z2.h -; CHECK-NEXT: fcmge p5.h, p0/z, z26.h, z2.h -; CHECK-NEXT: fcvtzs z0.d, p1/m, z27.h +; CHECK-NEXT: fcmge p1.h, p0/z, z27.h, z0.h +; CHECK-NEXT: fcmge p3.h, p0/z, z24.h, z0.h +; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z28.h, z0.h +; CHECK-NEXT: fcmge p5.h, p0/z, z26.h, z0.h +; CHECK-NEXT: fcvtzs z2.d, p1/m, z27.h ; CHECK-NEXT: fcvtzs z4.d, p3/m, z24.h ; CHECK-NEXT: fcvtzs z5.d, p4/m, z25.h ; CHECK-NEXT: fcmgt p3.h, p0/z, z27.h, z29.h ; CHECK-NEXT: fcvtzs z3.d, p2/m, z28.h -; CHECK-NEXT: fcmge p4.h, p0/z, z7.h, z2.h +; CHECK-NEXT: fcmge p4.h, p0/z, z7.h, z0.h ; CHECK-NEXT: fcvtzs z6.d, p5/m, z26.h ; CHECK-NEXT: fcmuo p1.h, p0/z, z27.h, z27.h ; CHECK-NEXT: movprfx z27, z30 @@ -212,7 +207,7 @@ define @llrint_v16i64_v16f16( %x) { ; CHECK-NEXT: fcmuo p2.h, p0/z, z28.h, z28.h ; CHECK-NEXT: mov z28.d, #0x8000000000000000 ; CHECK-NEXT: fcvtzs z31.d, p4/m, z7.h -; CHECK-NEXT: fcmge p4.h, p0/z, z27.h, z2.h +; CHECK-NEXT: fcmge p4.h, p0/z, z27.h, z0.h ; CHECK-NEXT: fcmgt p6.h, p0/z, z24.h, z29.h ; CHECK-NEXT: fcmuo p7.h, p0/z, z24.h, z24.h ; CHECK-NEXT: mov z24.d, #0x7fffffffffffffff @@ -221,31 +216,31 @@ define @llrint_v16i64_v16f16( %x) { ; CHECK-NEXT: fcmuo p10.h, p0/z, z25.h, z25.h ; CHECK-NEXT: mov z25.d, #0x8000000000000000 ; CHECK-NEXT: sel z1.d, p5, z24.d, z3.d -; CHECK-NEXT: mov z0.d, p3/m, z24.d ; CHECK-NEXT: sel z3.d, p8, z24.d, z5.d -; CHECK-NEXT: fcmge p4.h, p0/z, z30.h, z2.h +; CHECK-NEXT: fcmge p4.h, p0/z, z30.h, z0.h +; CHECK-NEXT: sel z0.d, p3, z24.d, z2.d ; CHECK-NEXT: sel z2.d, p6, z24.d, z4.d -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p10/m, #0 // =0x0 ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload +; CHECK-NEXT: fcmgt p9.h, p0/z, z26.h, z29.h ; CHECK-NEXT: mov z2.d, p7/m, #0 // =0x0 ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Reload -; CHECK-NEXT: fcmgt p9.h, p0/z, z26.h, z29.h ; CHECK-NEXT: fcvtzs z25.d, p4/m, z30.h +; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: fcmgt p5.h, p0/z, z7.h, z29.h ; CHECK-NEXT: fcmgt p6.h, p0/z, z27.h, z29.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z30.h, z29.h ; CHECK-NEXT: sel z4.d, p9, z24.d, z6.d +; CHECK-NEXT: fcmgt p4.h, p0/z, z30.h, z29.h ; CHECK-NEXT: fcmuo p8.h, p0/z, z7.h, z7.h ; CHECK-NEXT: sel z5.d, p5, z24.d, z31.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload ; CHECK-NEXT: sel z6.d, p6, z24.d, z28.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Reload ; CHECK-NEXT: fcmuo p9.h, p0/z, z27.h, z27.h +; CHECK-NEXT: fcmuo p3.h, p0/z, z26.h, z26.h ; CHECK-NEXT: sel z7.d, p4, z24.d, z25.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload -; CHECK-NEXT: fcmuo p3.h, p0/z, z26.h, z26.h ; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Reload ; CHECK-NEXT: fcmuo p0.h, p0/z, z30.h, z30.h @@ -302,48 +297,47 @@ define @llrint_v32i64_v32f16( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16 ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16 ; CHECK-NEXT: uunpklo z4.s, z0.h -; CHECK-NEXT: uunpkhi z5.s, z0.h -; CHECK-NEXT: mov w9, #64511 // =0xfbff -; CHECK-NEXT: uunpklo z6.s, z1.h -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpkhi z28.s, z1.h -; CHECK-NEXT: mov z30.h, w9 +; CHECK-NEXT: uunpkhi z0.s, 
z0.h ; CHECK-NEXT: mov w9, #31743 // =0x7bff +; CHECK-NEXT: uunpklo z5.s, z1.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z28.h, #-1025 // =0xfffffffffffffbff +; CHECK-NEXT: uunpkhi z29.s, z1.h +; CHECK-NEXT: mov z7.d, #0x8000000000000000 ; CHECK-NEXT: uunpklo z13.s, z2.h ; CHECK-NEXT: mov z9.d, #0x8000000000000000 ; CHECK-NEXT: uunpkhi z14.s, z2.h ; CHECK-NEXT: uunpkhi z17.s, z3.h -; CHECK-NEXT: uunpklo z7.d, z4.s +; CHECK-NEXT: uunpklo z6.d, z4.s ; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: uunpklo z27.d, z5.s -; CHECK-NEXT: uunpklo z31.d, z6.s -; CHECK-NEXT: uunpkhi z8.d, z6.s -; CHECK-NEXT: uunpkhi z29.d, z5.s -; CHECK-NEXT: uunpkhi z11.d, z28.s -; CHECK-NEXT: uunpklo z10.d, z28.s +; CHECK-NEXT: uunpklo z27.d, z0.s +; CHECK-NEXT: uunpklo z31.d, z5.s +; CHECK-NEXT: uunpkhi z8.d, z5.s +; CHECK-NEXT: uunpkhi z30.d, z0.s +; CHECK-NEXT: uunpkhi z11.d, z29.s +; CHECK-NEXT: uunpklo z10.d, z29.s ; CHECK-NEXT: uunpklo z15.s, z3.h ; CHECK-NEXT: uunpklo z16.d, z14.s ; CHECK-NEXT: uunpkhi z14.d, z14.s ; CHECK-NEXT: mov z24.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z1, z7 -; CHECK-NEXT: frintx z1.h, p0/m, z7.h ; CHECK-NEXT: movprfx z5, z27 ; CHECK-NEXT: frintx z5.h, p0/m, z27.h +; CHECK-NEXT: movprfx z1, z6 +; CHECK-NEXT: frintx z1.h, p0/m, z6.h ; CHECK-NEXT: frintx z4.h, p0/m, z4.h ; CHECK-NEXT: movprfx z12, z31 ; CHECK-NEXT: frintx z12.h, p0/m, z31.h ; CHECK-NEXT: movprfx z27, z8 ; CHECK-NEXT: frintx z27.h, p0/m, z8.h -; CHECK-NEXT: movprfx z6, z29 -; CHECK-NEXT: frintx z6.h, p0/m, z29.h +; CHECK-NEXT: movprfx z6, z30 +; CHECK-NEXT: frintx z6.h, p0/m, z30.h ; CHECK-NEXT: movprfx z31, z10 ; CHECK-NEXT: frintx z31.h, p0/m, z10.h -; CHECK-NEXT: mov z7.d, #0x8000000000000000 ; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: frintx z11.h, p0/m, z11.h ; CHECK-NEXT: movprfx z3, z16 ; CHECK-NEXT: frintx z3.h, p0/m, z16.h -; CHECK-NEXT: frintx z11.h, p0/m, z11.h -; CHECK-NEXT: mov z29.h, w9 +; CHECK-NEXT: mov z30.h, w9 ; CHECK-NEXT: uunpklo z10.d, z13.s ; CHECK-NEXT: uunpkhi z13.d, z13.s ; CHECK-NEXT: uunpkhi z20.d, z15.s @@ -354,124 +348,124 @@ define @llrint_v32i64_v32f16( %x) { ; CHECK-NEXT: uunpklo z15.d, z15.s ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z21.d, #0x8000000000000000 +; CHECK-NEXT: frintx z10.h, p0/m, z10.h ; CHECK-NEXT: mov z26.d, #0x8000000000000000 -; CHECK-NEXT: mov z28.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z29.d, #0x7fffffffffffffff ; CHECK-NEXT: movprfx z19, z13 ; CHECK-NEXT: frintx z19.h, p0/m, z13.h ; CHECK-NEXT: movprfx z13, z14 ; CHECK-NEXT: frintx z13.h, p0/m, z14.h -; CHECK-NEXT: frintx z10.h, p0/m, z10.h ; CHECK-NEXT: frintx z16.h, p0/m, z16.h ; CHECK-NEXT: mov z22.d, #0x8000000000000000 ; CHECK-NEXT: mov z23.d, #0x8000000000000000 -; CHECK-NEXT: frintx z15.h, p0/m, z15.h ; CHECK-NEXT: mov z14.d, #0x8000000000000000 -; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z30.h -; CHECK-NEXT: fcmge p2.h, p0/z, z12.h, z30.h -; CHECK-NEXT: fcmgt p9.h, p0/z, z12.h, z29.h +; CHECK-NEXT: frintx z15.h, p0/m, z15.h +; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z28.h +; CHECK-NEXT: fcmge p2.h, p0/z, z12.h, z28.h +; CHECK-NEXT: fcmgt p9.h, p0/z, z12.h, z30.h ; CHECK-NEXT: fcmuo p8.h, p0/z, z12.h, z12.h ; CHECK-NEXT: fcvtzs z7.d, p4/m, z4.h ; CHECK-NEXT: fcvtzs z8.d, p2/m, z12.h ; CHECK-NEXT: mov z12.d, #0x8000000000000000 -; CHECK-NEXT: fcmge p4.h, p0/z, z27.h, z30.h +; CHECK-NEXT: fcmge p4.h, p0/z, z27.h, z28.h ; CHECK-NEXT: fcmuo p10.h, p0/z, z11.h, z11.h -; CHECK-NEXT: fcmge p3.h, p0/z, z5.h, z30.h -; CHECK-NEXT: mov z8.d, p9/m, z28.d +; CHECK-NEXT: fcmge p3.h, 
p0/z, z5.h, z28.h +; CHECK-NEXT: mov z8.d, p9/m, z29.d ; CHECK-NEXT: fcvtzs z9.d, p4/m, z27.h -; CHECK-NEXT: fcmge p4.h, p0/z, z11.h, z30.h +; CHECK-NEXT: fcmge p4.h, p0/z, z11.h, z28.h ; CHECK-NEXT: fcvtzs z24.d, p3/m, z5.h ; CHECK-NEXT: mov z8.d, p8/m, #0 // =0x0 -; CHECK-NEXT: fcmge p1.h, p0/z, z6.h, z30.h -; CHECK-NEXT: fcmge p5.h, p0/z, z1.h, z30.h +; CHECK-NEXT: fcmge p1.h, p0/z, z6.h, z28.h +; CHECK-NEXT: fcmge p5.h, p0/z, z1.h, z28.h ; CHECK-NEXT: str z8, [x8, #4, mul vl] ; CHECK-NEXT: fcvtzs z12.d, p4/m, z11.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z29.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z30.h ; CHECK-NEXT: uunpkhi z11.d, z17.s ; CHECK-NEXT: movprfx z17, z20 ; CHECK-NEXT: frintx z17.h, p0/m, z20.h ; CHECK-NEXT: fcvtzs z25.d, p1/m, z6.h ; CHECK-NEXT: mov z20.d, #0x8000000000000000 ; CHECK-NEXT: fcvtzs z0.d, p5/m, z1.h -; CHECK-NEXT: fcmge p6.h, p0/z, z10.h, z30.h +; CHECK-NEXT: fcmge p6.h, p0/z, z10.h, z28.h ; CHECK-NEXT: frintx z11.h, p0/m, z11.h -; CHECK-NEXT: fcmge p3.h, p0/z, z31.h, z30.h -; CHECK-NEXT: fcmge p1.h, p0/z, z13.h, z30.h +; CHECK-NEXT: fcmge p3.h, p0/z, z31.h, z28.h +; CHECK-NEXT: fcmge p1.h, p0/z, z13.h, z28.h ; CHECK-NEXT: fcvtzs z18.d, p6/m, z10.h -; CHECK-NEXT: fcmgt p11.h, p0/z, z10.h, z29.h -; CHECK-NEXT: fcmge p5.h, p0/z, z11.h, z30.h +; CHECK-NEXT: fcmgt p11.h, p0/z, z10.h, z30.h +; CHECK-NEXT: fcmge p5.h, p0/z, z11.h, z28.h ; CHECK-NEXT: fcvtzs z2.d, p3/m, z31.h ; CHECK-NEXT: fcvtzs z21.d, p1/m, z13.h -; CHECK-NEXT: fcmge p2.h, p0/z, z17.h, z30.h -; CHECK-NEXT: fcmge p3.h, p0/z, z16.h, z30.h +; CHECK-NEXT: fcmge p2.h, p0/z, z17.h, z28.h +; CHECK-NEXT: fcmge p3.h, p0/z, z16.h, z28.h ; CHECK-NEXT: fcmuo p1.h, p0/z, z10.h, z10.h -; CHECK-NEXT: sel z10.d, p4, z28.d, z12.d -; CHECK-NEXT: sel z12.d, p11, z28.d, z18.d +; CHECK-NEXT: sel z10.d, p4, z29.d, z12.d +; CHECK-NEXT: sel z12.d, p11, z29.d, z18.d ; CHECK-NEXT: fcvtzs z26.d, p5/m, z11.h ; CHECK-NEXT: fcvtzs z22.d, p2/m, z17.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z29.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z30.h ; CHECK-NEXT: fcvtzs z23.d, p3/m, z16.h ; CHECK-NEXT: mov z10.d, p10/m, #0 // =0x0 ; CHECK-NEXT: mov z12.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmge p6.h, p0/z, z19.h, z30.h +; CHECK-NEXT: fcmge p6.h, p0/z, z19.h, z28.h ; CHECK-NEXT: str z10, [x8, #7, mul vl] -; CHECK-NEXT: fcmge p7.h, p0/z, z3.h, z30.h +; CHECK-NEXT: fcmge p7.h, p0/z, z3.h, z28.h ; CHECK-NEXT: str z12, [x8, #8, mul vl] -; CHECK-NEXT: mov z26.d, p4/m, z28.d -; CHECK-NEXT: fcmge p2.h, p0/z, z15.h, z30.h -; CHECK-NEXT: mov z30.d, #0x8000000000000000 +; CHECK-NEXT: mov z26.d, p4/m, z29.d +; CHECK-NEXT: fcmge p2.h, p0/z, z15.h, z28.h +; CHECK-NEXT: mov z28.d, #0x8000000000000000 ; CHECK-NEXT: fcvtzs z14.d, p6/m, z19.h -; CHECK-NEXT: fcmgt p5.h, p0/z, z16.h, z29.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z29.h +; CHECK-NEXT: fcmgt p5.h, p0/z, z16.h, z30.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z30.h ; CHECK-NEXT: fcvtzs z20.d, p7/m, z3.h -; CHECK-NEXT: fcvtzs z30.d, p2/m, z15.h +; CHECK-NEXT: fcvtzs z28.d, p2/m, z15.h ; CHECK-NEXT: fcmuo p1.h, p0/z, z11.h, z11.h ; CHECK-NEXT: fcmuo p2.h, p0/z, z16.h, z16.h -; CHECK-NEXT: sel z11.d, p5, z28.d, z23.d -; CHECK-NEXT: sel z16.d, p3, z28.d, z22.d -; CHECK-NEXT: fcmgt p4.h, p0/z, z19.h, z29.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z15.h, z29.h +; CHECK-NEXT: sel z11.d, p5, z29.d, z23.d +; CHECK-NEXT: sel z16.d, p3, z29.d, z22.d +; CHECK-NEXT: fcmgt p4.h, p0/z, z19.h, z30.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z15.h, z30.h ; CHECK-NEXT: mov z26.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z11.d, p2/m, #0 
// =0x0 -; CHECK-NEXT: fcmgt p1.h, p0/z, z13.h, z29.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z13.h, z30.h ; CHECK-NEXT: fcmuo p6.h, p0/z, z17.h, z17.h ; CHECK-NEXT: str z26, [x8, #15, mul vl] -; CHECK-NEXT: sel z26.d, p4, z28.d, z14.d +; CHECK-NEXT: sel z26.d, p4, z29.d, z14.d ; CHECK-NEXT: str z11, [x8, #14, mul vl] -; CHECK-NEXT: mov z30.d, p3/m, z28.d -; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z29.h +; CHECK-NEXT: mov z28.d, p3/m, z29.d +; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z30.h ; CHECK-NEXT: fcmuo p4.h, p0/z, z13.h, z13.h ; CHECK-NEXT: fcmuo p3.h, p0/z, z3.h, z3.h -; CHECK-NEXT: sel z3.d, p1, z28.d, z21.d +; CHECK-NEXT: sel z3.d, p1, z29.d, z21.d ; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p12.h, p0/z, z27.h, z29.h -; CHECK-NEXT: sel z11.d, p2, z28.d, z20.d +; CHECK-NEXT: fcmgt p12.h, p0/z, z27.h, z30.h +; CHECK-NEXT: sel z11.d, p2, z29.d, z20.d ; CHECK-NEXT: str z16, [x8, #13, mul vl] ; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 ; CHECK-NEXT: fcmuo p6.h, p0/z, z15.h, z15.h -; CHECK-NEXT: fcmgt p1.h, p0/z, z4.h, z29.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z4.h, z30.h ; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z9.d, p12/m, z28.d +; CHECK-NEXT: mov z9.d, p12/m, z29.d ; CHECK-NEXT: str z3, [x8, #11, mul vl] ; CHECK-NEXT: fcmuo p5.h, p0/z, z19.h, z19.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z5.h, z29.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z5.h, z30.h ; CHECK-NEXT: str z11, [x8, #10, mul vl] -; CHECK-NEXT: mov z30.d, p6/m, #0 // =0x0 -; CHECK-NEXT: sel z3.d, p1, z28.d, z7.d -; CHECK-NEXT: fcmgt p4.h, p0/z, z6.h, z29.h +; CHECK-NEXT: mov z28.d, p6/m, #0 // =0x0 +; CHECK-NEXT: sel z3.d, p1, z29.d, z7.d +; CHECK-NEXT: fcmgt p4.h, p0/z, z6.h, z30.h ; CHECK-NEXT: fcmuo p3.h, p0/z, z27.h, z27.h -; CHECK-NEXT: str z30, [x8, #12, mul vl] +; CHECK-NEXT: str z28, [x8, #12, mul vl] ; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0 -; CHECK-NEXT: sel z7.d, p2, z28.d, z24.d -; CHECK-NEXT: fcmgt p6.h, p0/z, z31.h, z29.h -; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z29.h +; CHECK-NEXT: sel z7.d, p2, z29.d, z24.d +; CHECK-NEXT: fcmgt p6.h, p0/z, z31.h, z30.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z30.h ; CHECK-NEXT: str z26, [x8, #9, mul vl] -; CHECK-NEXT: sel z24.d, p4, z28.d, z25.d +; CHECK-NEXT: sel z24.d, p4, z29.d, z25.d ; CHECK-NEXT: mov z9.d, p3/m, #0 // =0x0 ; CHECK-NEXT: fcmuo p5.h, p0/z, z31.h, z31.h ; CHECK-NEXT: fcmuo p2.h, p0/z, z6.h, z6.h -; CHECK-NEXT: mov z2.d, p6/m, z28.d +; CHECK-NEXT: mov z2.d, p6/m, z29.d ; CHECK-NEXT: str z9, [x8, #5, mul vl] -; CHECK-NEXT: mov z0.d, p1/m, z28.d +; CHECK-NEXT: mov z0.d, p1/m, z29.d ; CHECK-NEXT: fcmuo p3.h, p0/z, z5.h, z5.h ; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h ; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 diff --git a/llvm/test/CodeGen/AArch64/sve-lrint.ll b/llvm/test/CodeGen/AArch64/sve-lrint.ll index f517e7fe8dc16..f1224d30d53cc 100644 --- a/llvm/test/CodeGen/AArch64/sve-lrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-lrint.ll @@ -6,9 +6,8 @@ define @lrint_v1f16( %x) { ; CHECK-LABEL: lrint_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: mov z1.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h @@ -29,9 +28,8 @@ define @lrint_v2f16( %x) { ; CHECK-LABEL: lrint_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: mov z1.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: mov z2.d, 
#0x8000000000000000 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h @@ -53,10 +51,9 @@ define @lrint_v4f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z2.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: mov z4.d, #0x8000000000000000 ; CHECK-NEXT: mov z5.d, #0x7fffffffffffffff @@ -93,10 +90,9 @@ define @lrint_v8f16( %x) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z0.s, z0.h -; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z4.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: mov z5.d, #0x8000000000000000 ; CHECK-NEXT: mov z6.d, #0x8000000000000000 ; CHECK-NEXT: mov z7.d, #0x8000000000000000 @@ -163,12 +159,13 @@ define @lrint_v16f16( %x) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z3.s, z0.h -; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: uunpklo z7.s, z1.h ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z0.h, #-1025 // =0xfffffffffffffbff ; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: mov z0.d, #0x8000000000000000 ; CHECK-NEXT: mov z5.d, #0x8000000000000000 +; CHECK-NEXT: mov z29.h, w8 ; CHECK-NEXT: mov z31.d, #0x8000000000000000 ; CHECK-NEXT: uunpklo z4.d, z2.s ; CHECK-NEXT: uunpklo z24.d, z3.s @@ -176,10 +173,8 @@ define @lrint_v16f16( %x) { ; CHECK-NEXT: uunpkhi z6.d, z2.s ; CHECK-NEXT: uunpklo z26.d, z7.s ; CHECK-NEXT: uunpkhi z7.d, z7.s -; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: uunpklo z30.d, z1.s -; CHECK-NEXT: mov z29.h, w8 ; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: movprfx z27, z4 @@ -192,17 +187,17 @@ define @lrint_v16f16( %x) { ; CHECK-NEXT: frintx z26.h, p0/m, z26.h ; CHECK-NEXT: frintx z7.h, p0/m, z7.h ; CHECK-NEXT: mov z6.d, #0x8000000000000000 -; CHECK-NEXT: fcmge p1.h, p0/z, z27.h, z2.h -; CHECK-NEXT: fcmge p3.h, p0/z, z24.h, z2.h -; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z2.h -; CHECK-NEXT: fcmge p2.h, p0/z, z28.h, z2.h -; CHECK-NEXT: fcmge p5.h, p0/z, z26.h, z2.h -; CHECK-NEXT: fcvtzs z0.d, p1/m, z27.h +; CHECK-NEXT: fcmge p1.h, p0/z, z27.h, z0.h +; CHECK-NEXT: fcmge p3.h, p0/z, z24.h, z0.h +; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z28.h, z0.h +; CHECK-NEXT: fcmge p5.h, p0/z, z26.h, z0.h +; CHECK-NEXT: fcvtzs z2.d, p1/m, z27.h ; CHECK-NEXT: fcvtzs z4.d, p3/m, z24.h ; CHECK-NEXT: fcvtzs z5.d, p4/m, z25.h ; CHECK-NEXT: fcmgt p3.h, p0/z, z27.h, z29.h ; CHECK-NEXT: fcvtzs z3.d, p2/m, z28.h -; CHECK-NEXT: fcmge p4.h, p0/z, z7.h, z2.h +; CHECK-NEXT: fcmge p4.h, p0/z, z7.h, z0.h ; CHECK-NEXT: fcvtzs z6.d, p5/m, z26.h ; CHECK-NEXT: fcmuo p1.h, p0/z, z27.h, z27.h ; CHECK-NEXT: movprfx z27, z30 @@ -213,7 +208,7 @@ define @lrint_v16f16( %x) { ; CHECK-NEXT: fcmuo p2.h, p0/z, z28.h, z28.h ; CHECK-NEXT: mov z28.d, #0x8000000000000000 ; CHECK-NEXT: fcvtzs z31.d, p4/m, z7.h -; CHECK-NEXT: fcmge p4.h, p0/z, z27.h, z2.h +; CHECK-NEXT: fcmge p4.h, p0/z, z27.h, z0.h ; CHECK-NEXT: fcmgt p6.h, p0/z, 
z24.h, z29.h ; CHECK-NEXT: fcmuo p7.h, p0/z, z24.h, z24.h ; CHECK-NEXT: mov z24.d, #0x7fffffffffffffff @@ -222,31 +217,31 @@ define @lrint_v16f16( %x) { ; CHECK-NEXT: fcmuo p10.h, p0/z, z25.h, z25.h ; CHECK-NEXT: mov z25.d, #0x8000000000000000 ; CHECK-NEXT: sel z1.d, p5, z24.d, z3.d -; CHECK-NEXT: mov z0.d, p3/m, z24.d ; CHECK-NEXT: sel z3.d, p8, z24.d, z5.d -; CHECK-NEXT: fcmge p4.h, p0/z, z30.h, z2.h +; CHECK-NEXT: fcmge p4.h, p0/z, z30.h, z0.h +; CHECK-NEXT: sel z0.d, p3, z24.d, z2.d ; CHECK-NEXT: sel z2.d, p6, z24.d, z4.d -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p10/m, #0 // =0x0 ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload +; CHECK-NEXT: fcmgt p9.h, p0/z, z26.h, z29.h ; CHECK-NEXT: mov z2.d, p7/m, #0 // =0x0 ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Reload -; CHECK-NEXT: fcmgt p9.h, p0/z, z26.h, z29.h ; CHECK-NEXT: fcvtzs z25.d, p4/m, z30.h +; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: fcmgt p5.h, p0/z, z7.h, z29.h ; CHECK-NEXT: fcmgt p6.h, p0/z, z27.h, z29.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z30.h, z29.h ; CHECK-NEXT: sel z4.d, p9, z24.d, z6.d +; CHECK-NEXT: fcmgt p4.h, p0/z, z30.h, z29.h ; CHECK-NEXT: fcmuo p8.h, p0/z, z7.h, z7.h ; CHECK-NEXT: sel z5.d, p5, z24.d, z31.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload ; CHECK-NEXT: sel z6.d, p6, z24.d, z28.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Reload ; CHECK-NEXT: fcmuo p9.h, p0/z, z27.h, z27.h +; CHECK-NEXT: fcmuo p3.h, p0/z, z26.h, z26.h ; CHECK-NEXT: sel z7.d, p4, z24.d, z25.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload -; CHECK-NEXT: fcmuo p3.h, p0/z, z26.h, z26.h ; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Reload ; CHECK-NEXT: fcmuo p0.h, p0/z, z30.h, z30.h @@ -303,48 +298,47 @@ define @lrint_v32f16( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16 ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16 ; CHECK-NEXT: uunpklo z4.s, z0.h -; CHECK-NEXT: uunpkhi z5.s, z0.h -; CHECK-NEXT: mov w9, #64511 // =0xfbff -; CHECK-NEXT: uunpklo z6.s, z1.h -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpkhi z28.s, z1.h -; CHECK-NEXT: mov z30.h, w9 +; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov w9, #31743 // =0x7bff +; CHECK-NEXT: uunpklo z5.s, z1.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z28.h, #-1025 // =0xfffffffffffffbff +; CHECK-NEXT: uunpkhi z29.s, z1.h +; CHECK-NEXT: mov z7.d, #0x8000000000000000 ; CHECK-NEXT: uunpklo z13.s, z2.h ; CHECK-NEXT: mov z9.d, #0x8000000000000000 ; CHECK-NEXT: uunpkhi z14.s, z2.h ; CHECK-NEXT: uunpkhi z17.s, z3.h -; CHECK-NEXT: uunpklo z7.d, z4.s +; CHECK-NEXT: uunpklo z6.d, z4.s ; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: uunpklo z27.d, z5.s -; CHECK-NEXT: uunpklo z31.d, z6.s -; CHECK-NEXT: uunpkhi z8.d, z6.s -; CHECK-NEXT: uunpkhi z29.d, z5.s -; CHECK-NEXT: uunpkhi z11.d, z28.s -; CHECK-NEXT: uunpklo z10.d, z28.s +; CHECK-NEXT: uunpklo z27.d, z0.s +; CHECK-NEXT: uunpklo z31.d, z5.s +; CHECK-NEXT: uunpkhi z8.d, z5.s +; CHECK-NEXT: uunpkhi z30.d, z0.s +; CHECK-NEXT: uunpkhi z11.d, z29.s +; CHECK-NEXT: uunpklo z10.d, z29.s ; CHECK-NEXT: uunpklo z15.s, z3.h ; CHECK-NEXT: uunpklo z16.d, z14.s ; CHECK-NEXT: uunpkhi z14.d, z14.s ; CHECK-NEXT: mov z24.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z1, z7 -; CHECK-NEXT: frintx z1.h, p0/m, z7.h ; CHECK-NEXT: movprfx z5, z27 ; CHECK-NEXT: 
frintx z5.h, p0/m, z27.h +; CHECK-NEXT: movprfx z1, z6 +; CHECK-NEXT: frintx z1.h, p0/m, z6.h ; CHECK-NEXT: frintx z4.h, p0/m, z4.h ; CHECK-NEXT: movprfx z12, z31 ; CHECK-NEXT: frintx z12.h, p0/m, z31.h ; CHECK-NEXT: movprfx z27, z8 ; CHECK-NEXT: frintx z27.h, p0/m, z8.h -; CHECK-NEXT: movprfx z6, z29 -; CHECK-NEXT: frintx z6.h, p0/m, z29.h +; CHECK-NEXT: movprfx z6, z30 +; CHECK-NEXT: frintx z6.h, p0/m, z30.h ; CHECK-NEXT: movprfx z31, z10 ; CHECK-NEXT: frintx z31.h, p0/m, z10.h -; CHECK-NEXT: mov z7.d, #0x8000000000000000 ; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: frintx z11.h, p0/m, z11.h ; CHECK-NEXT: movprfx z3, z16 ; CHECK-NEXT: frintx z3.h, p0/m, z16.h -; CHECK-NEXT: frintx z11.h, p0/m, z11.h -; CHECK-NEXT: mov z29.h, w9 +; CHECK-NEXT: mov z30.h, w9 ; CHECK-NEXT: uunpklo z10.d, z13.s ; CHECK-NEXT: uunpkhi z13.d, z13.s ; CHECK-NEXT: uunpkhi z20.d, z15.s @@ -355,124 +349,124 @@ define @lrint_v32f16( %x) { ; CHECK-NEXT: uunpklo z15.d, z15.s ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z21.d, #0x8000000000000000 +; CHECK-NEXT: frintx z10.h, p0/m, z10.h ; CHECK-NEXT: mov z26.d, #0x8000000000000000 -; CHECK-NEXT: mov z28.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z29.d, #0x7fffffffffffffff ; CHECK-NEXT: movprfx z19, z13 ; CHECK-NEXT: frintx z19.h, p0/m, z13.h ; CHECK-NEXT: movprfx z13, z14 ; CHECK-NEXT: frintx z13.h, p0/m, z14.h -; CHECK-NEXT: frintx z10.h, p0/m, z10.h ; CHECK-NEXT: frintx z16.h, p0/m, z16.h ; CHECK-NEXT: mov z22.d, #0x8000000000000000 ; CHECK-NEXT: mov z23.d, #0x8000000000000000 -; CHECK-NEXT: frintx z15.h, p0/m, z15.h ; CHECK-NEXT: mov z14.d, #0x8000000000000000 -; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z30.h -; CHECK-NEXT: fcmge p2.h, p0/z, z12.h, z30.h -; CHECK-NEXT: fcmgt p9.h, p0/z, z12.h, z29.h +; CHECK-NEXT: frintx z15.h, p0/m, z15.h +; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z28.h +; CHECK-NEXT: fcmge p2.h, p0/z, z12.h, z28.h +; CHECK-NEXT: fcmgt p9.h, p0/z, z12.h, z30.h ; CHECK-NEXT: fcmuo p8.h, p0/z, z12.h, z12.h ; CHECK-NEXT: fcvtzs z7.d, p4/m, z4.h ; CHECK-NEXT: fcvtzs z8.d, p2/m, z12.h ; CHECK-NEXT: mov z12.d, #0x8000000000000000 -; CHECK-NEXT: fcmge p4.h, p0/z, z27.h, z30.h +; CHECK-NEXT: fcmge p4.h, p0/z, z27.h, z28.h ; CHECK-NEXT: fcmuo p10.h, p0/z, z11.h, z11.h -; CHECK-NEXT: fcmge p3.h, p0/z, z5.h, z30.h -; CHECK-NEXT: mov z8.d, p9/m, z28.d +; CHECK-NEXT: fcmge p3.h, p0/z, z5.h, z28.h +; CHECK-NEXT: mov z8.d, p9/m, z29.d ; CHECK-NEXT: fcvtzs z9.d, p4/m, z27.h -; CHECK-NEXT: fcmge p4.h, p0/z, z11.h, z30.h +; CHECK-NEXT: fcmge p4.h, p0/z, z11.h, z28.h ; CHECK-NEXT: fcvtzs z24.d, p3/m, z5.h ; CHECK-NEXT: mov z8.d, p8/m, #0 // =0x0 -; CHECK-NEXT: fcmge p1.h, p0/z, z6.h, z30.h -; CHECK-NEXT: fcmge p5.h, p0/z, z1.h, z30.h +; CHECK-NEXT: fcmge p1.h, p0/z, z6.h, z28.h +; CHECK-NEXT: fcmge p5.h, p0/z, z1.h, z28.h ; CHECK-NEXT: str z8, [x8, #4, mul vl] ; CHECK-NEXT: fcvtzs z12.d, p4/m, z11.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z29.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z30.h ; CHECK-NEXT: uunpkhi z11.d, z17.s ; CHECK-NEXT: movprfx z17, z20 ; CHECK-NEXT: frintx z17.h, p0/m, z20.h ; CHECK-NEXT: fcvtzs z25.d, p1/m, z6.h ; CHECK-NEXT: mov z20.d, #0x8000000000000000 ; CHECK-NEXT: fcvtzs z0.d, p5/m, z1.h -; CHECK-NEXT: fcmge p6.h, p0/z, z10.h, z30.h +; CHECK-NEXT: fcmge p6.h, p0/z, z10.h, z28.h ; CHECK-NEXT: frintx z11.h, p0/m, z11.h -; CHECK-NEXT: fcmge p3.h, p0/z, z31.h, z30.h -; CHECK-NEXT: fcmge p1.h, p0/z, z13.h, z30.h +; CHECK-NEXT: fcmge p3.h, p0/z, z31.h, z28.h +; CHECK-NEXT: fcmge p1.h, p0/z, z13.h, z28.h ; CHECK-NEXT: fcvtzs 
z18.d, p6/m, z10.h -; CHECK-NEXT: fcmgt p11.h, p0/z, z10.h, z29.h -; CHECK-NEXT: fcmge p5.h, p0/z, z11.h, z30.h +; CHECK-NEXT: fcmgt p11.h, p0/z, z10.h, z30.h +; CHECK-NEXT: fcmge p5.h, p0/z, z11.h, z28.h ; CHECK-NEXT: fcvtzs z2.d, p3/m, z31.h ; CHECK-NEXT: fcvtzs z21.d, p1/m, z13.h -; CHECK-NEXT: fcmge p2.h, p0/z, z17.h, z30.h -; CHECK-NEXT: fcmge p3.h, p0/z, z16.h, z30.h +; CHECK-NEXT: fcmge p2.h, p0/z, z17.h, z28.h +; CHECK-NEXT: fcmge p3.h, p0/z, z16.h, z28.h ; CHECK-NEXT: fcmuo p1.h, p0/z, z10.h, z10.h -; CHECK-NEXT: sel z10.d, p4, z28.d, z12.d -; CHECK-NEXT: sel z12.d, p11, z28.d, z18.d +; CHECK-NEXT: sel z10.d, p4, z29.d, z12.d +; CHECK-NEXT: sel z12.d, p11, z29.d, z18.d ; CHECK-NEXT: fcvtzs z26.d, p5/m, z11.h ; CHECK-NEXT: fcvtzs z22.d, p2/m, z17.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z29.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z30.h ; CHECK-NEXT: fcvtzs z23.d, p3/m, z16.h ; CHECK-NEXT: mov z10.d, p10/m, #0 // =0x0 ; CHECK-NEXT: mov z12.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmge p6.h, p0/z, z19.h, z30.h +; CHECK-NEXT: fcmge p6.h, p0/z, z19.h, z28.h ; CHECK-NEXT: str z10, [x8, #7, mul vl] -; CHECK-NEXT: fcmge p7.h, p0/z, z3.h, z30.h +; CHECK-NEXT: fcmge p7.h, p0/z, z3.h, z28.h ; CHECK-NEXT: str z12, [x8, #8, mul vl] -; CHECK-NEXT: mov z26.d, p4/m, z28.d -; CHECK-NEXT: fcmge p2.h, p0/z, z15.h, z30.h -; CHECK-NEXT: mov z30.d, #0x8000000000000000 +; CHECK-NEXT: mov z26.d, p4/m, z29.d +; CHECK-NEXT: fcmge p2.h, p0/z, z15.h, z28.h +; CHECK-NEXT: mov z28.d, #0x8000000000000000 ; CHECK-NEXT: fcvtzs z14.d, p6/m, z19.h -; CHECK-NEXT: fcmgt p5.h, p0/z, z16.h, z29.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z29.h +; CHECK-NEXT: fcmgt p5.h, p0/z, z16.h, z30.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z30.h ; CHECK-NEXT: fcvtzs z20.d, p7/m, z3.h -; CHECK-NEXT: fcvtzs z30.d, p2/m, z15.h +; CHECK-NEXT: fcvtzs z28.d, p2/m, z15.h ; CHECK-NEXT: fcmuo p1.h, p0/z, z11.h, z11.h ; CHECK-NEXT: fcmuo p2.h, p0/z, z16.h, z16.h -; CHECK-NEXT: sel z11.d, p5, z28.d, z23.d -; CHECK-NEXT: sel z16.d, p3, z28.d, z22.d -; CHECK-NEXT: fcmgt p4.h, p0/z, z19.h, z29.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z15.h, z29.h +; CHECK-NEXT: sel z11.d, p5, z29.d, z23.d +; CHECK-NEXT: sel z16.d, p3, z29.d, z22.d +; CHECK-NEXT: fcmgt p4.h, p0/z, z19.h, z30.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z15.h, z30.h ; CHECK-NEXT: mov z26.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z11.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p1.h, p0/z, z13.h, z29.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z13.h, z30.h ; CHECK-NEXT: fcmuo p6.h, p0/z, z17.h, z17.h ; CHECK-NEXT: str z26, [x8, #15, mul vl] -; CHECK-NEXT: sel z26.d, p4, z28.d, z14.d +; CHECK-NEXT: sel z26.d, p4, z29.d, z14.d ; CHECK-NEXT: str z11, [x8, #14, mul vl] -; CHECK-NEXT: mov z30.d, p3/m, z28.d -; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z29.h +; CHECK-NEXT: mov z28.d, p3/m, z29.d +; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z30.h ; CHECK-NEXT: fcmuo p4.h, p0/z, z13.h, z13.h ; CHECK-NEXT: fcmuo p3.h, p0/z, z3.h, z3.h -; CHECK-NEXT: sel z3.d, p1, z28.d, z21.d +; CHECK-NEXT: sel z3.d, p1, z29.d, z21.d ; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p12.h, p0/z, z27.h, z29.h -; CHECK-NEXT: sel z11.d, p2, z28.d, z20.d +; CHECK-NEXT: fcmgt p12.h, p0/z, z27.h, z30.h +; CHECK-NEXT: sel z11.d, p2, z29.d, z20.d ; CHECK-NEXT: str z16, [x8, #13, mul vl] ; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 ; CHECK-NEXT: fcmuo p6.h, p0/z, z15.h, z15.h -; CHECK-NEXT: fcmgt p1.h, p0/z, z4.h, z29.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z4.h, z30.h ; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z9.d, p12/m, z28.d +; 
CHECK-NEXT: mov z9.d, p12/m, z29.d ; CHECK-NEXT: str z3, [x8, #11, mul vl] ; CHECK-NEXT: fcmuo p5.h, p0/z, z19.h, z19.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z5.h, z29.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z5.h, z30.h ; CHECK-NEXT: str z11, [x8, #10, mul vl] -; CHECK-NEXT: mov z30.d, p6/m, #0 // =0x0 -; CHECK-NEXT: sel z3.d, p1, z28.d, z7.d -; CHECK-NEXT: fcmgt p4.h, p0/z, z6.h, z29.h +; CHECK-NEXT: mov z28.d, p6/m, #0 // =0x0 +; CHECK-NEXT: sel z3.d, p1, z29.d, z7.d +; CHECK-NEXT: fcmgt p4.h, p0/z, z6.h, z30.h ; CHECK-NEXT: fcmuo p3.h, p0/z, z27.h, z27.h -; CHECK-NEXT: str z30, [x8, #12, mul vl] +; CHECK-NEXT: str z28, [x8, #12, mul vl] ; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0 -; CHECK-NEXT: sel z7.d, p2, z28.d, z24.d -; CHECK-NEXT: fcmgt p6.h, p0/z, z31.h, z29.h -; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z29.h +; CHECK-NEXT: sel z7.d, p2, z29.d, z24.d +; CHECK-NEXT: fcmgt p6.h, p0/z, z31.h, z30.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z30.h ; CHECK-NEXT: str z26, [x8, #9, mul vl] -; CHECK-NEXT: sel z24.d, p4, z28.d, z25.d +; CHECK-NEXT: sel z24.d, p4, z29.d, z25.d ; CHECK-NEXT: mov z9.d, p3/m, #0 // =0x0 ; CHECK-NEXT: fcmuo p5.h, p0/z, z31.h, z31.h ; CHECK-NEXT: fcmuo p2.h, p0/z, z6.h, z6.h -; CHECK-NEXT: mov z2.d, p6/m, z28.d +; CHECK-NEXT: mov z2.d, p6/m, z29.d ; CHECK-NEXT: str z9, [x8, #5, mul vl] -; CHECK-NEXT: mov z0.d, p1/m, z28.d +; CHECK-NEXT: mov z0.d, p1/m, z29.d ; CHECK-NEXT: fcmuo p3.h, p0/z, z5.h, z5.h ; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h ; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 diff --git a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll index 5cca5539048b5..1ceaa5ad27734 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll @@ -509,6 +509,294 @@ define @splat_nxv2bf16_imm() { ret splat(bfloat 1.0) } +define @splat_nzero_nxv2f16() { +; CHECK-LABEL: splat_nzero_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x8000 +; CHECK-NEXT: ret + ret splat (half -0.0) +} + +define @splat_nzero_nxv4f16() { +; CHECK-LABEL: splat_nzero_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x8000 +; CHECK-NEXT: ret + ret splat (half -0.0) +} + +define @splat_nzero_nxv8f16() { +; CHECK-LABEL: splat_nzero_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x8000 +; CHECK-NEXT: ret + ret splat (half -0.0) +} + +define @splat_nzero_nxv2f32() { +; CHECK-LABEL: splat_nzero_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, #0x80000000 +; CHECK-NEXT: ret + ret splat (float -0.0) +} + +define @splat_nzero_nxv4f32() { +; CHECK-LABEL: splat_nzero_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, #0x80000000 +; CHECK-NEXT: ret + ret splat (float -0.0) +} + +define @splat_nzero_nxv2f64() { +; CHECK-LABEL: splat_nzero_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: ret + ret splat (double -0.0) +} + +define @splat_nzero_nxv2bf16() { +; CHECK-LABEL: splat_nzero_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x8000 +; CHECK-NEXT: ret + ret splat (bfloat -0.0) +} + +define @splat_nzero_nxv4bf16() { +; CHECK-LABEL: splat_nzero_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x8000 +; CHECK-NEXT: ret + ret splat (bfloat -0.0) +} + +define @splat_nzero_nxv8bf16() { +; CHECK-LABEL: splat_nzero_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x8000 +; CHECK-NEXT: ret + ret splat (bfloat -0.0) +} + +define @splat_pinf_nxv2f16() { +; CHECK-LABEL: splat_pinf_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x7c00 +; 
CHECK-NEXT: ret + ret splat (half 0x7FF0000000000000) +} + +define @splat_pinf_nxv4f16() { +; CHECK-LABEL: splat_pinf_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x7c00 +; CHECK-NEXT: ret + ret splat (half 0x7FF0000000000000) +} + +define @splat_pinf_nxv8f16() { +; CHECK-LABEL: splat_pinf_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x7c00 +; CHECK-NEXT: ret + ret splat (half 0x7FF0000000000000) +} + +define @splat_pinf_nxv2f32() { +; CHECK-LABEL: splat_pinf_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, #0x7f800000 +; CHECK-NEXT: ret + ret splat (float 0x7FF0000000000000) +} + +define @splat_pinf_nxv4f32() { +; CHECK-LABEL: splat_pinf_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, #0x7f800000 +; CHECK-NEXT: ret + ret splat (float 0x7FF0000000000000) +} + +define @splat_pinf_nxv2f64() { +; CHECK-LABEL: splat_pinf_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, #0x7ff0000000000000 +; CHECK-NEXT: ret + ret splat (double 0x7FF0000000000000) +} + +define @splat_pinf_nxv2bf16() { +; CHECK-LABEL: splat_pinf_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, #32640 // =0x7f80 +; CHECK-NEXT: ret + ret splat (bfloat 0x7FF0000000000000) +} + +define @splat_pinf_nxv4bf16() { +; CHECK-LABEL: splat_pinf_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, #32640 // =0x7f80 +; CHECK-NEXT: ret + ret splat (bfloat 0x7FF0000000000000) +} + +define @splat_pinf_nxv8bf16() { +; CHECK-LABEL: splat_pinf_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, #32640 // =0x7f80 +; CHECK-NEXT: ret + ret splat (bfloat 0x7FF0000000000000) +} + +define @splat_ninf_nxv2f16() { +; CHECK-LABEL: splat_ninf_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0xfc00 +; CHECK-NEXT: ret + ret splat (half 0xFFF0000000000000) +} + +define @splat_ninf_nxv4f16() { +; CHECK-LABEL: splat_ninf_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0xfc00 +; CHECK-NEXT: ret + ret splat (half 0xFFF0000000000000) +} + +define @splat_ninf_nxv8f16() { +; CHECK-LABEL: splat_ninf_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0xfc00 +; CHECK-NEXT: ret + ret splat (half 0xFFF0000000000000) +} + +define @splat_ninf_nxv2f32() { +; CHECK-LABEL: splat_ninf_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, #0xff800000 +; CHECK-NEXT: ret + ret splat (float 0xFFF0000000000000) +} + +define @splat_ninf_nxv4f32() { +; CHECK-LABEL: splat_ninf_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, #0xff800000 +; CHECK-NEXT: ret + ret splat (float 0xFFF0000000000000) +} + +define @splat_ninf_nxv2f64() { +; CHECK-LABEL: splat_ninf_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, #0xfff0000000000000 +; CHECK-NEXT: ret + ret splat (double 0xFFF0000000000000) +} + +define @splat_ninf_nxv2bf16() { +; CHECK-LABEL: splat_ninf_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0xff80 +; CHECK-NEXT: ret + ret splat (bfloat 0xFFF0000000000000) +} + +define @splat_ninf_nxv4bf16() { +; CHECK-LABEL: splat_ninf_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0xff80 +; CHECK-NEXT: ret + ret splat (bfloat 0xFFF0000000000000) +} + +define @splat_ninf_nxv8bf16() { +; CHECK-LABEL: splat_ninf_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0xff80 +; CHECK-NEXT: ret + ret splat (bfloat 0xFFF0000000000000) +} + +define @splat_nan_nxv2f16() { +; CHECK-LABEL: splat_nan_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x7e00 +; CHECK-NEXT: ret + ret splat (half 0x7FF8000000000000) +} + +define @splat_nan_nxv4f16() { +; CHECK-LABEL: splat_nan_nxv4f16: +; CHECK: // %bb.0: +; 
CHECK-NEXT: dupm z0.h, #0x7e00 +; CHECK-NEXT: ret + ret splat (half 0x7FF8000000000000) +} + +define @splat_nan_nxv8f16() { +; CHECK-LABEL: splat_nan_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: dupm z0.h, #0x7e00 +; CHECK-NEXT: ret + ret splat (half 0x7FF8000000000000) +} + +define @splat_nan_nxv2f32() { +; CHECK-LABEL: splat_nan_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, #0x7fc00000 +; CHECK-NEXT: ret + ret splat (float 0x7FF8000000000000) +} + +define @splat_nan_nxv4f32() { +; CHECK-LABEL: splat_nan_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, #0x7fc00000 +; CHECK-NEXT: ret + ret splat (float 0x7FF8000000000000) +} + +define @splat_nan_nxv2f64() { +; CHECK-LABEL: splat_nan_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, #0x7ff8000000000000 +; CHECK-NEXT: ret + ret splat (double 0x7FF8000000000000) +} + +define @splat_nan_nxv2bf16() { +; CHECK-LABEL: splat_nan_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, #32704 // =0x7fc0 +; CHECK-NEXT: ret + ret splat (bfloat 0x7FF8000000000000) +} + +define @splat_nan_nxv4bf16() { +; CHECK-LABEL: splat_nan_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, #32704 // =0x7fc0 +; CHECK-NEXT: ret + ret splat (bfloat 0x7FF8000000000000) +} + +define @splat_nan_nxv8bf16() { +; CHECK-LABEL: splat_nan_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, #32704 // =0x7fc0 +; CHECK-NEXT: ret + ret splat (bfloat 0x7FF8000000000000) +} + define @splat_nxv4i32_fold( %x) { ; CHECK-LABEL: splat_nxv4i32_fold: ; CHECK: // %bb.0: @@ -581,8 +869,8 @@ define @splat_nxv2f64_imm_out_of_range() { ; CHECK-LABEL: splat_nxv2f64_imm_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: adrp x8, .LCPI60_0 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI60_0 +; CHECK-NEXT: adrp x8, .LCPI96_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI96_0 ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] ; CHECK-NEXT: ret ret splat(double 3.33) diff --git a/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll b/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll index 6b5b3d6d436cb..b04029c273ae2 100644 --- a/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll @@ -338,8 +338,7 @@ ret %sel define @sel_merge_nxv8f16_negative_zero( %p, %in) { ; CHECK-LABEL: sel_merge_nxv8f16_negative_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: dupm z1.h, #0x8000 ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret %sel = select %p, splat (half -0.0), %in @@ -349,8 +348,7 @@ ret %sel define @sel_merge_nx4f16_negative_zero( %p, %in) { ; CHECK-LABEL: sel_merge_nx4f16_negative_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: dupm z1.h, #0x8000 ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret %sel = select %p, splat (half -0.0), %in @@ -360,8 +358,7 @@ ret %sel define @sel_merge_nx2f16_negative_zero( %p, %in) { ; CHECK-LABEL: sel_merge_nx2f16_negative_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: dupm z1.h, #0x8000 ; CHECK-NEXT: mov z0.d, p0/m, z1.d ; CHECK-NEXT: ret %sel = select %p, splat (half -0.0), %in @@ -371,8 +368,7 @@ ret %sel define @sel_merge_nx4f32_negative_zero( %p, %in) { ; CHECK-LABEL: sel_merge_nx4f32_negative_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000 -; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: mov z1.s, #0x80000000 ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret %sel = select %p, splat (float -0.0), %in @@ -382,8 
+378,7 @@ ret %sel define @sel_merge_nx2f32_negative_zero( %p, %in) { ; CHECK-LABEL: sel_merge_nx2f32_negative_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000 -; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: mov z1.s, #0x80000000 ; CHECK-NEXT: mov z0.d, p0/m, z1.d ; CHECK-NEXT: ret %sel = select %p, splat (float -0.0), %in @@ -393,8 +388,7 @@ ret %sel define @sel_merge_nx2f64_negative_zero( %p, %in) { ; CHECK-LABEL: sel_merge_nx2f64_negative_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: mov z0.d, p0/m, z1.d ; CHECK-NEXT: ret %sel = select %p, splat (double -0.0), %in diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index edfd80b4f2706..ace0c83e63c7c 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -4,20 +4,28 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; AARCH-LABEL: muloti_test: ; AARCH: // %bb.0: // %start +; AARCH-NEXT: orr x8, x1, x3 +; AARCH-NEXT: cbz x8, .LBB0_2 +; AARCH-NEXT: // %bb.1: // %overflow ; AARCH-NEXT: mul x9, x3, x0 ; AARCH-NEXT: cmp x1, #0 ; AARCH-NEXT: ccmp x3, #0, #4, ne -; AARCH-NEXT: umulh x8, x1, x2 -; AARCH-NEXT: umulh x10, x3, x0 +; AARCH-NEXT: umulh x10, x1, x2 +; AARCH-NEXT: umulh x8, x3, x0 ; AARCH-NEXT: madd x9, x1, x2, x9 -; AARCH-NEXT: ccmp xzr, x8, #0, eq -; AARCH-NEXT: umulh x11, x0, x2 ; AARCH-NEXT: ccmp xzr, x10, #0, eq +; AARCH-NEXT: umulh x11, x0, x2 +; AARCH-NEXT: ccmp xzr, x8, #0, eq ; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: cset w8, ne ; AARCH-NEXT: adds x1, x11, x9 ; AARCH-NEXT: csinc w2, w8, wzr, lo ; AARCH-NEXT: ret +; AARCH-NEXT: .LBB0_2: // %overflow.no +; AARCH-NEXT: umulh x1, x0, x2 +; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: mov w2, wzr +; AARCH-NEXT: ret start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 %1 = extractvalue { i128, i1 } %0, 0 @@ -35,45 +43,56 @@ start: define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 { ; AARCH-LABEL: __muloti4: ; AARCH: // %bb.0: // %Entry -; AARCH-NEXT: asr x11, x1, #63 -; AARCH-NEXT: asr x9, x3, #63 -; AARCH-NEXT: umulh x12, x0, x2 -; AARCH-NEXT: mov x8, x1 +; AARCH-NEXT: eor x8, x3, x2, asr #63 +; AARCH-NEXT: eor x9, x1, x0, asr #63 ; AARCH-NEXT: str wzr, [x4] -; AARCH-NEXT: mul x13, x1, x2 -; AARCH-NEXT: umulh x10, x1, x2 -; AARCH-NEXT: mul x11, x11, x2 -; AARCH-NEXT: adds x12, x13, x12 -; AARCH-NEXT: mul x15, x0, x3 -; AARCH-NEXT: umulh x14, x0, x3 -; AARCH-NEXT: adc x10, x10, x11 -; AARCH-NEXT: mul x9, x0, x9 -; AARCH-NEXT: mul x16, x1, x3 -; AARCH-NEXT: adds x1, x15, x12 -; AARCH-NEXT: asr x12, x10, #63 -; AARCH-NEXT: smulh x11, x8, x3 -; AARCH-NEXT: adc x9, x14, x9 -; AARCH-NEXT: asr x13, x9, #63 -; AARCH-NEXT: adds x9, x10, x9 -; AARCH-NEXT: asr x10, x1, #63 +; AARCH-NEXT: orr x8, x9, x8 +; AARCH-NEXT: cbz x8, .LBB1_2 +; AARCH-NEXT: // %bb.1: // %overflow +; AARCH-NEXT: asr x9, x1, #63 +; AARCH-NEXT: umulh x10, x0, x2 +; AARCH-NEXT: asr x13, x3, #63 +; AARCH-NEXT: mul x11, x1, x2 +; AARCH-NEXT: umulh x8, x1, x2 +; AARCH-NEXT: mul x9, x9, x2 +; AARCH-NEXT: adds x10, x11, x10 +; AARCH-NEXT: mul x14, x0, x3 +; AARCH-NEXT: umulh x12, x0, x3 +; AARCH-NEXT: adc x9, x8, x9 +; AARCH-NEXT: mul x13, x0, x13 +; AARCH-NEXT: adds x8, x14, x10 +; AARCH-NEXT: mul x15, x1, x3 +; 
AARCH-NEXT: smulh x10, x1, x3 +; AARCH-NEXT: adc x11, x12, x13 +; AARCH-NEXT: asr x12, x9, #63 +; AARCH-NEXT: asr x13, x11, #63 +; AARCH-NEXT: adds x9, x9, x11 +; AARCH-NEXT: asr x11, x8, #63 ; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: adc x12, x12, x13 -; AARCH-NEXT: adds x9, x16, x9 -; AARCH-NEXT: adc x11, x11, x12 -; AARCH-NEXT: cmp x9, x10 -; AARCH-NEXT: ccmp x11, x10, #0, eq +; AARCH-NEXT: adds x9, x15, x9 +; AARCH-NEXT: adc x10, x10, x12 +; AARCH-NEXT: cmp x9, x11 +; AARCH-NEXT: ccmp x10, x11, #0, eq ; AARCH-NEXT: cset w9, ne -; AARCH-NEXT: tbz x8, #63, .LBB1_2 -; AARCH-NEXT: // %bb.1: // %Entry -; AARCH-NEXT: eor x8, x3, #0x8000000000000000 -; AARCH-NEXT: orr x8, x2, x8 -; AARCH-NEXT: cbz x8, .LBB1_3 -; AARCH-NEXT: .LBB1_2: // %Else2 -; AARCH-NEXT: cbz w9, .LBB1_4 -; AARCH-NEXT: .LBB1_3: // %Then7 -; AARCH-NEXT: mov w8, #1 // =0x1 -; AARCH-NEXT: str w8, [x4] -; AARCH-NEXT: .LBB1_4: // %Block9 +; AARCH-NEXT: tbnz x1, #63, .LBB1_3 +; AARCH-NEXT: b .LBB1_4 +; AARCH-NEXT: .LBB1_2: // %overflow.no +; AARCH-NEXT: smulh x8, x0, x2 +; AARCH-NEXT: mov w9, wzr +; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: tbz x1, #63, .LBB1_4 +; AARCH-NEXT: .LBB1_3: // %overflow.res +; AARCH-NEXT: eor x10, x3, #0x8000000000000000 +; AARCH-NEXT: orr x10, x2, x10 +; AARCH-NEXT: cbz x10, .LBB1_5 +; AARCH-NEXT: .LBB1_4: // %Else2 +; AARCH-NEXT: cbz w9, .LBB1_6 +; AARCH-NEXT: .LBB1_5: // %Then7 +; AARCH-NEXT: mov w9, #1 // =0x1 +; AARCH-NEXT: str w9, [x4] +; AARCH-NEXT: .LBB1_6: // %Block9 +; AARCH-NEXT: mov x1, x8 ; AARCH-NEXT: ret Entry: store i32 0, ptr %2, align 4 diff --git a/llvm/test/CodeGen/DirectX/ddx_coarse-errors.ll b/llvm/test/CodeGen/DirectX/ddx_coarse-errors.ll new file mode 100644 index 0000000000000..0679eec31cec1 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ddx_coarse-errors.ll @@ -0,0 +1,15 @@ +; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s + +; DXIL operation ddx.coarse does not support double overload type +; CHECK: in function ddx.coarse +; CHECK-SAME: Cannot create DerivCoarseX operation: Invalid overload type + +; Function Attrs: noinline nounwind optnone +define noundef double @ddx.coarse_double(double noundef %a) #0 { +entry: + %a.addr = alloca double, align 8 + store double %a, ptr %a.addr, align 8 + %0 = load double, ptr %a.addr, align 8 + %dx.ddx.coarse = call double @llvm.dx.ddx.coarse.f64(double %0) + ret double %dx.ddx.coarse +} diff --git a/llvm/test/CodeGen/DirectX/ddx_coarse.ll b/llvm/test/CodeGen/DirectX/ddx_coarse.ll new file mode 100644 index 0000000000000..f6ea031273263 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ddx_coarse.ll @@ -0,0 +1,40 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +; Make sure dxil operation function calls for ddx_coarse are generated for half/float and matching vectors + +define noundef half @deriv_coarse_x_half(half noundef %a) { +; CHECK: call half @dx.op.unary.f16(i32 83, half %{{.*}}) +entry: + %dx.ddx.coarse = call half @llvm.dx.ddx.coarse.f16(half %a) + ret half %dx.ddx.coarse +} + +define noundef float @deriv_coarse_x_float(float noundef %a) { +; CHECK: call float @dx.op.unary.f32(i32 83, float %{{.*}}) +entry: + %dx.ddx.coarse = call float @llvm.dx.ddx.coarse.f32(float %a) + ret float %dx.ddx.coarse +} + +define noundef <4 x float> @deriv_coarse_x_float4(<4 x float> noundef %a) { +; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0 +; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 83, float [[ee0]]) +; CHECK: [[ee1:%.*]] = 
extractelement <4 x float> %a, i64 1 +; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 83, float [[ee1]]) +; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2 +; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 83, float [[ee2]]) +; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3 +; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 83, float [[ee3]]) +; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0 +; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1 +; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2 +; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3 +; CHECK: ret <4 x float> %{{.*}} +entry: + %dx.ddx.coarse = call <4 x float> @llvm.dx.ddx.coarse.v4f32(<4 x float> %a) + ret <4 x float> %dx.ddx.coarse +} + +declare half @llvm.dx.ddx.coarse.f16(half) +declare float @llvm.dx.ddx.coarse.f32(float) +declare <4 x float> @llvm.dx.ddx.coarse.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/DirectX/ddy_coarse-errors.ll b/llvm/test/CodeGen/DirectX/ddy_coarse-errors.ll new file mode 100644 index 0000000000000..df8e3eb0f7e0b --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ddy_coarse-errors.ll @@ -0,0 +1,15 @@ +; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s + +; DXIL operation ddy.coarse does not support double overload type +; CHECK: in function ddy.coarse +; CHECK-SAME: Cannot create DerivCoarseY operation: Invalid overload type + +; Function Attrs: noinline nounwind optnone +define noundef double @ddy.coarse_double(double noundef %a) #0 { +entry: + %a.addr = alloca double, align 8 + store double %a, ptr %a.addr, align 8 + %0 = load double, ptr %a.addr, align 8 + %dx.ddy.coarse = call double @llvm.dx.ddy.coarse.f64(double %0) + ret double %dx.ddy.coarse +} diff --git a/llvm/test/CodeGen/DirectX/ddy_coarse.ll b/llvm/test/CodeGen/DirectX/ddy_coarse.ll new file mode 100644 index 0000000000000..e3337022e1b01 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ddy_coarse.ll @@ -0,0 +1,40 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +; Make sure dxil operation function calls for ddy_coarse are generated for half/float and matching vectors + +define noundef half @deriv_coarse_y_half(half noundef %a) { +; CHECK: call half @dx.op.unary.f16(i32 84, half %{{.*}}) +entry: + %dx.ddy.coarse = call half @llvm.dx.ddy.coarse.f16(half %a) + ret half %dx.ddy.coarse +} + +define noundef float @deriv_coarse_y_float(float noundef %a) { +; CHECK: call float @dx.op.unary.f32(i32 84, float %{{.*}}) +entry: + %dx.ddy.coarse = call float @llvm.dx.ddy.coarse.f32(float %a) + ret float %dx.ddy.coarse +} + +define noundef <4 x float> @deriv_coarse_y_float4(<4 x float> noundef %a) { +; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0 +; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 84, float [[ee0]]) +; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1 +; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 84, float [[ee1]]) +; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2 +; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 84, float [[ee2]]) +; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3 +; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 84, float [[ee3]]) +; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0 +; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1 +; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2 +; CHECK: insertelement <4 x 
float> %{{.*}}, float [[ie3]], i64 3 +; CHECK: ret <4 x float> %{{.*}} +entry: + %dx.ddy.coarse = call <4 x float> @llvm.dx.ddy.coarse.v4f32(<4 x float> %a) + ret <4 x float> %dx.ddy.coarse +} + +declare half @llvm.dx.ddy.coarse.f16(half) +declare float @llvm.dx.ddy.coarse.f32(float) +declare <4 x float> @llvm.dx.ddy.coarse.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddx_coarse.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddx_coarse.ll new file mode 100644 index 0000000000000..478acb53701ea --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddx_coarse.ll @@ -0,0 +1,47 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val --target-env spv1.4 %} + +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 + +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 + +define noundef float @ddx_coarse_float(float noundef %a) { +entry: +; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]] +; CHECK: %[[#]] = OpDPdxCoarse %[[#float_32]] %[[#float_32_arg]] + %elt.ddx.coarse = call float @llvm.spv.ddx.coarse.f32(float %a) + ret float %elt.ddx.coarse +} + +define noundef half @ddx_coarse_half(half noundef %a) { +entry: +; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]] +; CHECK: %[[#converted:]] = OpFConvert %[[#float_32:]] %[[#float_16_arg]] +; CHECK: %[[#coarse:]] = OpDPdxCoarse %[[#float_32]] %[[#converted]] +; CHECK: %[[#]] = OpFConvert %[[#float_16]] %[[#coarse]] + %elt.ddx.coarse = call half @llvm.spv.ddx.coarse.f16(half %a) + ret half %elt.ddx.coarse +} + +define noundef <4 x float> @ddx_coarse_float_vector(<4 x float> noundef %a) { +entry: +; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]] +; CHECK: %[[#]] = OpDPdxCoarse %[[#vec4_float_32]] %[[#vec4_float_32_arg]] + %elt.ddx.coarse = call <4 x float> @llvm.spv.ddx.coarse.v4f32(<4 x float> %a) + ret <4 x float> %elt.ddx.coarse +} + +define noundef <4 x half> @ddx_coarse_half_vector(<4 x half> noundef %a) { +entry: +; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]] +; CHECK: %[[#converted:]] = OpFConvert %[[#vec4_float_32:]] %[[#vec4_float_16_arg]] +; CHECK: %[[#coarse:]] = OpDPdxCoarse %[[#vec4_float_32]] %[[#converted]] +; CHECK: %[[#]] = OpFConvert %[[#vec4_float_16]] %[[#coarse]] + %elt.ddx.coarse = call <4 x half> @llvm.spv.ddx.coarse.v4f16(<4 x half> %a) + ret <4 x half> %elt.ddx.coarse +} + +declare float @llvm.spv.ddx.coarse.f32(float) +declare half @llvm.spv.ddx.coarse.f16(half) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddy_coarse.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddy_coarse.ll new file mode 100644 index 0000000000000..8ad67cb644aa7 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddy_coarse.ll @@ -0,0 +1,47 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val --target-env spv1.4 %} + +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 + +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 + +define noundef float @ddy_coarse_float(float noundef %a) { +entry: +; CHECK: 
%[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]] +; CHECK: %[[#]] = OpDPdyCoarse %[[#float_32]] %[[#float_32_arg]] + %elt.ddy.coarse = call float @llvm.spv.ddy.coarse.f32(float %a) + ret float %elt.ddy.coarse +} + +define noundef half @ddy_coarse_half(half noundef %a) { +entry: +; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]] +; CHECK: %[[#converted:]] = OpFConvert %[[#float_32:]] %[[#float_16_arg]] +; CHECK: %[[#coarse:]] = OpDPdyCoarse %[[#float_32]] %[[#converted]] +; CHECK: %[[#]] = OpFConvert %[[#float_16]] %[[#coarse]] + %elt.ddy.coarse = call half @llvm.spv.ddy.coarse.f16(half %a) + ret half %elt.ddy.coarse +} + +define noundef <4 x float> @ddy_coarse_float_vector(<4 x float> noundef %a) { +entry: +; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]] +; CHECK: %[[#]] = OpDPdyCoarse %[[#vec4_float_32]] %[[#vec4_float_32_arg]] + %elt.ddy.coarse = call <4 x float> @llvm.spv.ddy.coarse.v4f32(<4 x float> %a) + ret <4 x float> %elt.ddy.coarse +} + +define noundef <4 x half> @ddy_coarse_half_vector(<4 x half> noundef %a) { +entry: +; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]] +; CHECK: %[[#converted:]] = OpFConvert %[[#vec4_float_32:]] %[[#vec4_float_16_arg]] +; CHECK: %[[#coarse:]] = OpDPdyCoarse %[[#vec4_float_32]] %[[#converted]] +; CHECK: %[[#]] = OpFConvert %[[#vec4_float_16]] %[[#coarse]] + %elt.ddy.coarse = call <4 x half> @llvm.spv.ddy.coarse.v4f16(<4 x half> %a) + ret <4 x half> %elt.ddy.coarse +} + +declare float @llvm.spv.ddy.coarse.f32(float) +declare half @llvm.spv.ddy.coarse.f16(half) diff --git a/llvm/test/CodeGen/SPIRV/opencl/ddx_coarse-error.ll b/llvm/test/CodeGen/SPIRV/opencl/ddx_coarse-error.ll new file mode 100644 index 0000000000000..e93c1d1ba4d36 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/ddx_coarse-error.ll @@ -0,0 +1,12 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: %{{.*}} = G_INTRINSIC intrinsic(@llvm.spv.ddx.coarse), %{{.*}} is only supported in shaders. + +define noundef float @ddx_coarse(float noundef %a) { +entry: + %spv.ddx.coarse = call float @llvm.spv.ddx.coarse.f32(float %a) + ret float %spv.ddx.coarse +} + +declare float @llvm.spv.ddx.coarse.f32(float) diff --git a/llvm/test/CodeGen/SPIRV/opencl/ddy_coarse-error.ll b/llvm/test/CodeGen/SPIRV/opencl/ddy_coarse-error.ll new file mode 100644 index 0000000000000..aa71a395d8680 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/ddy_coarse-error.ll @@ -0,0 +1,12 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: %{{.*}} = G_INTRINSIC intrinsic(@llvm.spv.ddy.coarse), %{{.*}} is only supported in shaders. 
+ +define noundef float @ddy_coarse(float noundef %a) { +entry: + %spv.ddy.coarse = call float @llvm.spv.ddy.coarse.f32(float %a) + ret float %spv.ddy.coarse +} + +declare float @llvm.spv.ddy.coarse.f32(float) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index b85a20b9d6b6e..023fb5065b892 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1877,85 +1877,56 @@ define i32 @blsr_u512(ptr %word) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: pushq %r15 ; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r12 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: pushq %rax -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq 48(%rdi), %rdx -; SSE-NEXT: movq 40(%rdi), %rsi -; SSE-NEXT: movq 32(%rdi), %r11 +; SSE-NEXT: movq 48(%rdi), %r11 +; SSE-NEXT: movq 40(%rdi), %r9 ; SSE-NEXT: movq 24(%rdi), %r8 -; SSE-NEXT: movq 16(%rdi), %r9 -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %r10 -; SSE-NEXT: rep bsfq %rax, %rbx -; SSE-NEXT: rep bsfq %r10, %r14 -; SSE-NEXT: addq $64, %r14 -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: cmovneq %rbx, %r14 -; SSE-NEXT: rep bsfq %r9, %r15 -; SSE-NEXT: rep bsfq %r8, %rbx +; SSE-NEXT: movq 16(%rdi), %rdx +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %rsi +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: rep bsfq %rsi, %rbx ; SSE-NEXT: addq $64, %rbx -; SSE-NEXT: testq %r9, %r9 -; SSE-NEXT: cmovneq %r15, %rbx -; SSE-NEXT: subq $-128, %rbx -; SSE-NEXT: movq %rax, %r15 -; SSE-NEXT: movq %rax, %r12 -; SSE-NEXT: orq %r10, %r12 -; SSE-NEXT: cmovneq %r14, %rbx -; SSE-NEXT: rep bsfq %r11, %r12 -; SSE-NEXT: rep bsfq %rsi, %r14 -; SSE-NEXT: addq $64, %r14 -; SSE-NEXT: testq %r11, %r11 -; SSE-NEXT: cmovneq %r12, %r14 -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: rep bsfq %rdx, %r12 +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovneq %rax, %rbx +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %r8, %r10 +; SSE-NEXT: addq $64, %r10 +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovneq %rax, %r10 +; SSE-NEXT: movq 32(%rdi), %r14 +; SSE-NEXT: subq $-128, %r10 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: cmovneq %rbx, %r10 +; SSE-NEXT: rep bsfq %r14, %rax +; SSE-NEXT: rep bsfq %r9, %rbx +; SSE-NEXT: addq $64, %rbx +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovneq %rax, %rbx +; SSE-NEXT: rep bsfq %r11, %r15 ; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: rep bsfq 56(%rdi), %rax ; SSE-NEXT: addq $64, %rax -; SSE-NEXT: testq %rdx, %rdx -; SSE-NEXT: cmovneq %r12, %rax +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovneq %r15, %rax ; SSE-NEXT: subq $-128, %rax -; SSE-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; SSE-NEXT: orq %rsi, %r11 -; SSE-NEXT: cmovneq %r14, %rax -; SSE-NEXT: addq $256, %rax # imm = 0x100 -; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; SSE-NEXT: orq %r8, %r10 -; SSE-NEXT: orq %r9, %r15 -; SSE-NEXT: orq %r10, %r15 +; SSE-NEXT: orq %r9, %r14 ; SSE-NEXT: cmovneq %rbx, %rax -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; SSE-NEXT: addq $256, %rax # imm = 0x100 +; SSE-NEXT: orq %r8, %rsi +; SSE-NEXT: orq %rdx, %rcx +; SSE-NEXT: orq %rsi, %rcx +; SSE-NEXT: 
cmovneq %r10, %rax +; SSE-NEXT: movl $-2, %edx +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: roll %cl, %edx ; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andl $32, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: andl $480, %edx # imm = 0x1E0 -; SSE-NEXT: shrl $3, %edx -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: andl $-8, %esi -; SSE-NEXT: movq -128(%rsp,%rsi), %r8 -; SSE-NEXT: shrq %cl, %r8 -; SSE-NEXT: movl -120(%rsp,%rsi), %esi -; SSE-NEXT: addl %esi, %esi -; SSE-NEXT: notl %ecx -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: orl %r8d, %esi -; SSE-NEXT: btrl %eax, %esi -; SSE-NEXT: movl %esi, (%rdi,%rdx) +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: andl $60, %ecx +; SSE-NEXT: andl %edx, (%rdi,%rcx) ; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: addq $8, %rsp ; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 ; SSE-NEXT: popq %r14 ; SSE-NEXT: popq %r15 ; SSE-NEXT: retq @@ -1964,133 +1935,86 @@ define i32 @blsr_u512(ptr %word) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %r15 ; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: movq 56(%rdi), %rcx -; AVX2-NEXT: movq 40(%rdi), %rdx -; AVX2-NEXT: movq 32(%rdi), %r11 -; AVX2-NEXT: movq 24(%rdi), %rsi -; AVX2-NEXT: movq 16(%rdi), %r8 -; AVX2-NEXT: movq (%rdi), %r9 -; AVX2-NEXT: movq 8(%rdi), %r10 -; AVX2-NEXT: xorl %ebx, %ebx -; AVX2-NEXT: tzcntq %r9, %rbx -; AVX2-NEXT: tzcntq %r10, %rax -; AVX2-NEXT: addq $64, %rax -; AVX2-NEXT: testq %r9, %r9 -; AVX2-NEXT: cmovneq %rbx, %rax -; AVX2-NEXT: xorl %r14d, %r14d -; AVX2-NEXT: tzcntq %r8, %r14 +; AVX2-NEXT: movq 40(%rdi), %r9 +; AVX2-NEXT: movq 32(%rdi), %r10 +; AVX2-NEXT: movq 24(%rdi), %r8 +; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 8(%rdi), %rsi +; AVX2-NEXT: tzcntq %rcx, %rax ; AVX2-NEXT: xorl %ebx, %ebx ; AVX2-NEXT: tzcntq %rsi, %rbx ; AVX2-NEXT: addq $64, %rbx -; AVX2-NEXT: testq %r8, %r8 -; AVX2-NEXT: cmovneq %r14, %rbx -; AVX2-NEXT: subq $-128, %rbx -; AVX2-NEXT: movq %r9, %r14 -; AVX2-NEXT: movq %r9, %r15 -; AVX2-NEXT: orq %r10, %r15 +; AVX2-NEXT: testq %rcx, %rcx ; AVX2-NEXT: cmovneq %rax, %rbx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: tzcntq %r11, %rax -; AVX2-NEXT: xorl %r12d, %r12d -; AVX2-NEXT: tzcntq %rdx, %r12 -; AVX2-NEXT: addq $64, %r12 -; AVX2-NEXT: testq %r11, %r11 -; AVX2-NEXT: cmovneq %rax, %r12 -; AVX2-NEXT: movq 48(%rdi), %r15 -; AVX2-NEXT: xorl %r13d, %r13d -; AVX2-NEXT: tzcntq %r15, %r13 +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: tzcntq %r8, %r11 +; AVX2-NEXT: addq $64, %r11 +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovneq %rax, %r11 +; AVX2-NEXT: subq $-128, %r11 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: orq %rsi, %rax +; AVX2-NEXT: cmovneq %rbx, %r11 ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: tzcntq %r10, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %r9, %rbx +; AVX2-NEXT: addq $64, %rbx +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovneq %rax, %rbx +; AVX2-NEXT: movq 48(%rdi), %r14 +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r14, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 56(%rdi), %rax ; AVX2-NEXT: addq $64, %rax -; AVX2-NEXT: testq %r15, %r15 -; AVX2-NEXT: cmovneq %r13, %rax +; AVX2-NEXT: testq %r14, %r14 +; AVX2-NEXT: cmovneq %r15, %rax ; AVX2-NEXT: subq $-128, %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: orq %rdx, %r11 -; AVX2-NEXT: cmovneq %r12, 
%rax -; AVX2-NEXT: addq $256, %rax # imm = 0x100 -; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: orq %rsi, %r10 -; AVX2-NEXT: orq %r8, %r14 -; AVX2-NEXT: orq %r10, %r14 +; AVX2-NEXT: orq %r9, %r10 ; AVX2-NEXT: cmovneq %rbx, %rax -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r15, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: addq $256, %rax # imm = 0x100 +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: cmovneq %r11, %rax +; AVX2-NEXT: movl $-2, %edx +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: roll %cl, %edx ; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $32, %ecx -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: andl $480, %edx # imm = 0x1E0 -; AVX2-NEXT: shrl $3, %edx -; AVX2-NEXT: movl %edx, %esi -; AVX2-NEXT: andl $-8, %esi -; AVX2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 -; AVX2-NEXT: notl %ecx -; AVX2-NEXT: movl -120(%rsp,%rsi), %esi -; AVX2-NEXT: addl %esi, %esi -; AVX2-NEXT: shlxq %rcx, %rsi, %rcx -; AVX2-NEXT: orl %r8d, %ecx -; AVX2-NEXT: btrl %eax, %ecx -; AVX2-NEXT: movl %ecx, (%rdi,%rdx) +; AVX2-NEXT: shrl $3, %ecx +; AVX2-NEXT: andl $60, %ecx +; AVX2-NEXT: andl %edx, (%rdi,%rcx) ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: popq %r15 -; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: blsr_u512: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm2 -; AVX512-NEXT: vpternlogd {{.*#+}} zmm3 = -1 -; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm3 -; AVX512-NEXT: vpandnq %zmm3, %zmm2, %zmm3 -; AVX512-NEXT: vplzcntq %zmm3, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] -; AVX512-NEXT: vpsubq %zmm3, %zmm4, %zmm3 -; AVX512-NEXT: vptestmq %zmm2, %zmm2, %k1 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [512,512,512,512,512,512,512,512] -; AVX512-NEXT: vpcompressq %zmm3, %zmm2 {%k1} -; AVX512-NEXT: vmovq %xmm2, %rax -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovdqu %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovdqu %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: movl $-2, %edx +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: roll %cl, %edx ; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: andl $32, %ecx -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: notl %edx -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: movl %esi, %r8d -; AVX512-NEXT: andl $56, %r8d -; AVX512-NEXT: movl -120(%rsp,%r8), %r9d -; AVX512-NEXT: addl %r9d, %r9d -; AVX512-NEXT: shlxq %rdx, %r9, %rdx ; AVX512-NEXT: shrl $3, %ecx -; AVX512-NEXT: addq 
%rsp, %r8 -; AVX512-NEXT: addq $-128, %r8 -; AVX512-NEXT: orl (%rcx,%r8), %edx -; AVX512-NEXT: btrl %eax, %edx -; AVX512-NEXT: andl $60, %esi -; AVX512-NEXT: movl %edx, (%rdi,%rsi) +; AVX512-NEXT: andl $60, %ecx +; AVX512-NEXT: andl %edx, (%rdi,%rcx) ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rcx ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %ld = load i512, ptr %word diff --git a/llvm/test/Transforms/InstSimplify/AArch64/aarch64-sve-reductions.ll b/llvm/test/Transforms/InstSimplify/AArch64/aarch64-sve-reductions.ll new file mode 100644 index 0000000000000..a54d6044d04b1 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/AArch64/aarch64-sve-reductions.ll @@ -0,0 +1,912 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=instsimplify < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; ANDV +; + +define i8 @andv_i8_no_active( %a) #0 { +; CHECK-LABEL: define i8 @andv_i8_no_active( +; CHECK-SAME: [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: ret i8 -1 +; + %out = call i8 @llvm.aarch64.sve.andv.nxv16i8( zeroinitializer, %a) + ret i8 %out +} + +define i8 @andv_i8_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @andv_i8_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 -1 +; + %out = call i8 @llvm.aarch64.sve.andv.nxv16i8( %pg, splat(i8 -1)) + ret i8 %out +} + +define i8 @andv_i8_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @andv_i8_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i8 @llvm.aarch64.sve.andv.nxv16i8( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i8 [[OUT]] +; + %out = call i8 @llvm.aarch64.sve.andv.nxv16i8( %pg, zeroinitializer) + ret i8 %out +} + +define i8 @andv_i8_all_active_splat(i8 %a) #0 { +; CHECK-LABEL: define i8 @andv_i8_all_active_splat( +; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 [[A]] +; + %a.insert = insertelement poison, i8 %a, i8 0 + %a.splat = shufflevector %a.insert, poison, zeroinitializer + %out = call i8 @llvm.aarch64.sve.andv.nxv16i8( splat (i1 true), %a.splat) + ret i8 %out +} + +define i16 @andv_i16_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @andv_i16_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i16 -1 +; + %out = call i16 @llvm.aarch64.sve.andv.nxv8i16( %pg, splat(i16 -1)) + ret i16 %out +} + +define i16 @andv_i16_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @andv_i16_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i16 @llvm.aarch64.sve.andv.nxv8i16( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i16 [[OUT]] +; + %out = call i16 @llvm.aarch64.sve.andv.nxv8i16( %pg, zeroinitializer) + ret i16 %out +} + +define i32 @andv_i32_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @andv_i32_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i32 -1 +; + %out = call i32 @llvm.aarch64.sve.andv.nxv4i32( %pg, splat(i32 -1)) + ret i32 %out +} + +define i32 @andv_i32_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @andv_i32_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i32 @llvm.aarch64.sve.andv.nxv4i32( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i32 [[OUT]] +; + %out = call i32 @llvm.aarch64.sve.andv.nxv4i32( %pg, zeroinitializer) + ret i32 %out +} + +define i64 @andv_i64_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 
@andv_i64_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 -1 +; + %out = call i64 @llvm.aarch64.sve.andv.nxv2i64( %pg, splat(i64 -1)) + ret i64 %out +} + +define i64 @andv_i64_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @andv_i64_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.andv.nxv2i64( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.andv.nxv2i64( %pg, zeroinitializer) + ret i64 %out +} + +; +; EORV +; + +define i8 @eorv_i8_no_active( %a) #0 { +; CHECK-LABEL: define i8 @eorv_i8_no_active( +; CHECK-SAME: [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 0 +; + %out = call i8 @llvm.aarch64.sve.eorv.nxv16i8( zeroinitializer, %a) + ret i8 %out +} + +define i8 @eorv_i8_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @eorv_i8_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 0 +; + %out = call i8 @llvm.aarch64.sve.eorv.nxv16i8( %pg, zeroinitializer) + ret i8 %out +} + +define i8 @eorv_i8_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @eorv_i8_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i8 @llvm.aarch64.sve.eorv.nxv16i8( [[PG]], splat (i8 1)) +; CHECK-NEXT: ret i8 [[OUT]] +; + %out = call i8 @llvm.aarch64.sve.eorv.nxv16i8( %pg, splat(i8 1)) + ret i8 %out +} + +define i8 @eorv_i8_all_active_splat(i8 %a) #0 { +; CHECK-LABEL: define i8 @eorv_i8_all_active_splat( +; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 0 +; + %a.insert = insertelement poison, i8 %a, i8 0 + %a.splat = shufflevector %a.insert, poison, zeroinitializer + %out = call i8 @llvm.aarch64.sve.eorv.nxv16i8( splat (i1 true), %a.splat) + ret i8 %out +} + +define i16 @eorv_i16_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @eorv_i16_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i16 0 +; + %out = call i16 @llvm.aarch64.sve.eorv.nxv8i16( %pg, zeroinitializer) + ret i16 %out +} + +define i16 @eorv_i16_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @eorv_i16_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i16 @llvm.aarch64.sve.eorv.nxv8i16( [[PG]], splat (i16 1)) +; CHECK-NEXT: ret i16 [[OUT]] +; + %out = call i16 @llvm.aarch64.sve.eorv.nxv8i16( %pg, splat(i16 1)) + ret i16 %out +} + +define i32 @eorv_i32_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @eorv_i32_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i32 0 +; + %out = call i32 @llvm.aarch64.sve.eorv.nxv4i32( %pg, zeroinitializer) + ret i32 %out +} + +define i32 @eorv_i32_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @eorv_i32_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i32 @llvm.aarch64.sve.eorv.nxv4i32( [[PG]], splat (i32 1)) +; CHECK-NEXT: ret i32 [[OUT]] +; + %out = call i32 @llvm.aarch64.sve.eorv.nxv4i32( %pg, splat(i32 1)) + ret i32 %out +} + +define i64 @eorv_i64_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @eorv_i64_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.eorv.nxv2i64( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @eorv_i64_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @eorv_i64_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 
@llvm.aarch64.sve.eorv.nxv2i64( [[PG]], splat (i64 1)) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.eorv.nxv2i64( %pg, splat(i64 1)) + ret i64 %out +} + +; +; ORV +; + +define i8 @orv_i8_no_active( %a) #0 { +; CHECK-LABEL: define i8 @orv_i8_no_active( +; CHECK-SAME: [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 0 +; + %out = call i8 @llvm.aarch64.sve.orv.nxv16i8( zeroinitializer, %a) + ret i8 %out +} + +define i8 @orv_i8_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @orv_i8_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 0 +; + %out = call i8 @llvm.aarch64.sve.orv.nxv16i8( %pg, zeroinitializer) + ret i8 %out +} + +define i8 @orv_i8_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @orv_i8_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i8 @llvm.aarch64.sve.orv.nxv16i8( [[PG]], splat (i8 1)) +; CHECK-NEXT: ret i8 [[OUT]] +; + %out = call i8 @llvm.aarch64.sve.orv.nxv16i8( %pg, splat(i8 1)) + ret i8 %out +} + +define i8 @orv_i8_all_active_splat(i8 %a) #0 { +; CHECK-LABEL: define i8 @orv_i8_all_active_splat( +; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 [[A]] +; + %a.insert = insertelement poison, i8 %a, i8 0 + %a.splat = shufflevector %a.insert, poison, zeroinitializer + %out = call i8 @llvm.aarch64.sve.orv.nxv16i8( splat (i1 true), %a.splat) + ret i8 %out +} + +define i16 @orv_i16_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @orv_i16_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i16 0 +; + %out = call i16 @llvm.aarch64.sve.orv.nxv8i16( %pg, zeroinitializer) + ret i16 %out +} + +define i16 @orv_i16_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @orv_i16_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i16 @llvm.aarch64.sve.orv.nxv8i16( [[PG]], splat (i16 1)) +; CHECK-NEXT: ret i16 [[OUT]] +; + %out = call i16 @llvm.aarch64.sve.orv.nxv8i16( %pg, splat(i16 1)) + ret i16 %out +} + +define i32 @orv_i32_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @orv_i32_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i32 0 +; + %out = call i32 @llvm.aarch64.sve.orv.nxv4i32( %pg, zeroinitializer) + ret i32 %out +} + +define i32 @orv_i32_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @orv_i32_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i32 @llvm.aarch64.sve.orv.nxv4i32( [[PG]], splat (i32 1)) +; CHECK-NEXT: ret i32 [[OUT]] +; + %out = call i32 @llvm.aarch64.sve.orv.nxv4i32( %pg, splat(i32 1)) + ret i32 %out +} + +define i64 @orv_i64_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @orv_i64_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.orv.nxv2i64( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @orv_i64_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @orv_i64_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.orv.nxv2i64( [[PG]], splat (i64 1)) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.orv.nxv2i64( %pg, splat(i64 1)) + ret i64 %out +} + +; +; SADDV +; + +define i64 @saddv_i8_no_active( %a) #0 { +; CHECK-LABEL: define i64 @saddv_i8_no_active( +; CHECK-SAME: [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.saddv.nxv16i8( 
zeroinitializer, %a) + ret i64 %out +} + +define i64 @saddv_i8_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @saddv_i8_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.saddv.nxv16i8( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @saddv_i8_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @saddv_i8_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.saddv.nxv16i8( [[PG]], splat (i8 1)) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.saddv.nxv16i8( %pg, splat(i8 1)) + ret i64 %out +} + +define i64 @saddv_i8_all_active_splat(i8 %a) #0 { +; CHECK-LABEL: define i64 @saddv_i8_all_active_splat( +; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[A_INSERT:%.*]] = insertelement poison, i8 [[A]], i8 0 +; CHECK-NEXT: [[A_SPLAT:%.*]] = shufflevector [[A_INSERT]], poison, zeroinitializer +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.saddv.nxv16i8( splat (i1 true), [[A_SPLAT]]) +; CHECK-NEXT: ret i64 [[OUT]] +; + %a.insert = insertelement poison, i8 %a, i8 0 + %a.splat = shufflevector %a.insert, poison, zeroinitializer + %out = call i64 @llvm.aarch64.sve.saddv.nxv16i8( splat (i1 true), %a.splat) + ret i64 %out +} + +define i64 @saddv_i16_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @saddv_i16_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.saddv.nxv8i16( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @saddv_i16_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @saddv_i16_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.saddv.nxv8i16( [[PG]], splat (i16 1)) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.saddv.nxv8i16( %pg, splat(i16 1)) + ret i64 %out +} + +define i64 @saddv_i32_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @saddv_i32_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.saddv.nxv4i32( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @saddv_i32_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @saddv_i32_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.saddv.nxv4i32( [[PG]], splat (i32 1)) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.saddv.nxv4i32( %pg, splat(i32 1)) + ret i64 %out +} + +define i64 @saddv_i64_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @saddv_i64_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.saddv.nxv2i64( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @saddv_i64_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @saddv_i64_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.saddv.nxv2i64( [[PG]], splat (i64 1)) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.saddv.nxv2i64( %pg, splat(i64 1)) + ret i64 %out +} + +; +; SMAXV +; + +define i8 @smaxv_i8_no_active( %a) #0 { +; CHECK-LABEL: define i8 @smaxv_i8_no_active( +; CHECK-SAME: [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 -128 +; + %out = call i8 @llvm.aarch64.sve.smaxv.nxv16i8( zeroinitializer, %a) + ret i8 %out +} + +define i8 
@smaxv_i8_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @smaxv_i8_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 -128 +; + %out = call i8 @llvm.aarch64.sve.smaxv.nxv16i8( %pg, splat(i8 -128)) + ret i8 %out +} + +define i8 @smaxv_i8_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @smaxv_i8_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i8 @llvm.aarch64.sve.smaxv.nxv16i8( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i8 [[OUT]] +; + %out = call i8 @llvm.aarch64.sve.smaxv.nxv16i8( %pg, zeroinitializer) + ret i8 %out +} + +define i8 @smaxv_i8_all_active_splat(i8 %a) #0 { +; CHECK-LABEL: define i8 @smaxv_i8_all_active_splat( +; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 [[A]] +; + %a.insert = insertelement poison, i8 %a, i8 0 + %a.splat = shufflevector %a.insert, poison, zeroinitializer + %out = call i8 @llvm.aarch64.sve.smaxv.nxv16i8( splat (i1 true), %a.splat) + ret i8 %out +} + +define i16 @smaxv_i16_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @smaxv_i16_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i16 -32768 +; + %out = call i16 @llvm.aarch64.sve.smaxv.nxv8i16( %pg, splat(i16 -32768)) + ret i16 %out +} + +define i16 @smaxv_i16_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @smaxv_i16_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i16 @llvm.aarch64.sve.smaxv.nxv8i16( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i16 [[OUT]] +; + %out = call i16 @llvm.aarch64.sve.smaxv.nxv8i16( %pg, zeroinitializer) + ret i16 %out +} + +define i32 @smaxv_i32_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @smaxv_i32_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i32 -2147483648 +; + %out = call i32 @llvm.aarch64.sve.smaxv.nxv4i32( %pg, splat(i32 -2147483648)) + ret i32 %out +} + +define i32 @smaxv_i32_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @smaxv_i32_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i32 @llvm.aarch64.sve.smaxv.nxv4i32( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i32 [[OUT]] +; + %out = call i32 @llvm.aarch64.sve.smaxv.nxv4i32( %pg, zeroinitializer) + ret i32 %out +} + +define i64 @smaxv_i64_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @smaxv_i64_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 -9223372036854775808 +; + %out = call i64 @llvm.aarch64.sve.smaxv.nxv2i64( %pg, splat(i64 -9223372036854775808)) + ret i64 %out +} + +define i64 @smaxv_i64_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @smaxv_i64_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.smaxv.nxv2i64( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.smaxv.nxv2i64( %pg, zeroinitializer) + ret i64 %out +} + +; +; SMINV +; + +define i8 @sminv_i8_no_active( %a) #0 { +; CHECK-LABEL: define i8 @sminv_i8_no_active( +; CHECK-SAME: [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 127 +; + %out = call i8 @llvm.aarch64.sve.sminv.nxv16i8( zeroinitializer, %a) + ret i8 %out +} + +define i8 @sminv_i8_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @sminv_i8_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 127 +; + %out = call i8 @llvm.aarch64.sve.sminv.nxv16i8( %pg, splat(i8 127)) + ret i8 %out +} + +define i8 
@sminv_i8_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @sminv_i8_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i8 @llvm.aarch64.sve.sminv.nxv16i8( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i8 [[OUT]] +; + %out = call i8 @llvm.aarch64.sve.sminv.nxv16i8( %pg, zeroinitializer) + ret i8 %out +} + +define i8 @sminv_i8_all_active_splat(i8 %a) #0 { +; CHECK-LABEL: define i8 @sminv_i8_all_active_splat( +; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 [[A]] +; + %a.insert = insertelement poison, i8 %a, i8 0 + %a.splat = shufflevector %a.insert, poison, zeroinitializer + %out = call i8 @llvm.aarch64.sve.sminv.nxv16i8( splat (i1 true), %a.splat) + ret i8 %out +} + +define i16 @sminv_i16_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @sminv_i16_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i16 32767 +; + %out = call i16 @llvm.aarch64.sve.sminv.nxv8i16( %pg, splat(i16 32767)) + ret i16 %out +} + +define i16 @sminv_i16_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @sminv_i16_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i16 @llvm.aarch64.sve.sminv.nxv8i16( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i16 [[OUT]] +; + %out = call i16 @llvm.aarch64.sve.sminv.nxv8i16( %pg, zeroinitializer) + ret i16 %out +} + +define i32 @sminv_i32_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @sminv_i32_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i32 2147483647 +; + %out = call i32 @llvm.aarch64.sve.sminv.nxv4i32( %pg, splat(i32 2147483647)) + ret i32 %out +} + +define i32 @sminv_i32_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @sminv_i32_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i32 @llvm.aarch64.sve.sminv.nxv4i32( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i32 [[OUT]] +; + %out = call i32 @llvm.aarch64.sve.sminv.nxv4i32( %pg, zeroinitializer) + ret i32 %out +} + +define i64 @sminv_i64_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @sminv_i64_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 9223372036854775807 +; + %out = call i64 @llvm.aarch64.sve.sminv.nxv2i64( %pg, splat(i64 9223372036854775807)) + ret i64 %out +} + +define i64 @sminv_i64_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @sminv_i64_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.sminv.nxv2i64( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.sminv.nxv2i64( %pg, zeroinitializer) + ret i64 %out +} + +; +; UADDV +; + +define i64 @uaddv_i8_no_active( %a) #0 { +; CHECK-LABEL: define i64 @uaddv_i8_no_active( +; CHECK-SAME: [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.uaddv.nxv16i8( zeroinitializer, %a) + ret i64 %out +} + +define i64 @uaddv_i8_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @uaddv_i8_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.uaddv.nxv16i8( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @uaddv_i8_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @uaddv_i8_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv16i8( [[PG]], splat (i8 1)) +; CHECK-NEXT: ret i64 
[[OUT]] +; + %out = call i64 @llvm.aarch64.sve.uaddv.nxv16i8( %pg, splat(i8 1)) + ret i64 %out +} + +define i64 @uaddv_i8_all_active_splat(i8 %a) #0 { +; CHECK-LABEL: define i64 @uaddv_i8_all_active_splat( +; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[A_INSERT:%.*]] = insertelement poison, i8 [[A]], i8 0 +; CHECK-NEXT: [[A_SPLAT:%.*]] = shufflevector [[A_INSERT]], poison, zeroinitializer +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv16i8( splat (i1 true), [[A_SPLAT]]) +; CHECK-NEXT: ret i64 [[OUT]] +; + %a.insert = insertelement poison, i8 %a, i8 0 + %a.splat = shufflevector %a.insert, poison, zeroinitializer + %out = call i64 @llvm.aarch64.sve.uaddv.nxv16i8( splat (i1 true), %a.splat) + ret i64 %out +} + +define i64 @uaddv_i16_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @uaddv_i16_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.uaddv.nxv8i16( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @uaddv_i16_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @uaddv_i16_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv8i16( [[PG]], splat (i16 1)) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.uaddv.nxv8i16( %pg, splat(i16 1)) + ret i64 %out +} + +define i64 @uaddv_i32_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @uaddv_i32_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @uaddv_i32_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @uaddv_i32_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv4i32( [[PG]], splat (i32 1)) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %pg, splat(i32 1)) + ret i64 %out +} + +define i64 @uaddv_i64_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @uaddv_i64_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.uaddv.nxv2i64( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @uaddv_i64_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @uaddv_i64_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv2i64( [[PG]], splat (i64 1)) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.uaddv.nxv2i64( %pg, splat(i64 1)) + ret i64 %out +} + +; +; UMAXV +; + +define i8 @umaxv_i8_no_active( %a) #0 { +; CHECK-LABEL: define i8 @umaxv_i8_no_active( +; CHECK-SAME: [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 0 +; + %out = call i8 @llvm.aarch64.sve.umaxv.nxv16i8( zeroinitializer, %a) + ret i8 %out +} + +define i8 @umaxv_i8_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @umaxv_i8_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 0 +; + %out = call i8 @llvm.aarch64.sve.umaxv.nxv16i8( %pg, zeroinitializer) + ret i8 %out +} + +define i8 @umaxv_i8_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @umaxv_i8_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i8 @llvm.aarch64.sve.umaxv.nxv16i8( [[PG]], splat (i8 1)) +; CHECK-NEXT: ret i8 [[OUT]] +; + %out = call i8 @llvm.aarch64.sve.umaxv.nxv16i8( %pg, splat(i8 1)) + 
ret i8 %out +} + +define i8 @umaxv_i8_all_active_splat(i8 %a) #0 { +; CHECK-LABEL: define i8 @umaxv_i8_all_active_splat( +; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 [[A]] +; + %a.insert = insertelement poison, i8 %a, i8 0 + %a.splat = shufflevector %a.insert, poison, zeroinitializer + %out = call i8 @llvm.aarch64.sve.umaxv.nxv16i8( splat (i1 true), %a.splat) + ret i8 %out +} + +define i16 @umaxv_i16_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @umaxv_i16_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i16 0 +; + %out = call i16 @llvm.aarch64.sve.umaxv.nxv8i16( %pg, zeroinitializer) + ret i16 %out +} + +define i16 @umaxv_i16_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @umaxv_i16_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i16 @llvm.aarch64.sve.umaxv.nxv8i16( [[PG]], splat (i16 1)) +; CHECK-NEXT: ret i16 [[OUT]] +; + %out = call i16 @llvm.aarch64.sve.umaxv.nxv8i16( %pg, splat(i16 1)) + ret i16 %out +} + +define i32 @umaxv_i32_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @umaxv_i32_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i32 0 +; + %out = call i32 @llvm.aarch64.sve.umaxv.nxv4i32( %pg, zeroinitializer) + ret i32 %out +} + +define i32 @umaxv_i32_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @umaxv_i32_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i32 @llvm.aarch64.sve.umaxv.nxv4i32( [[PG]], splat (i32 1)) +; CHECK-NEXT: ret i32 [[OUT]] +; + %out = call i32 @llvm.aarch64.sve.umaxv.nxv4i32( %pg, splat(i32 1)) + ret i32 %out +} + +define i64 @umaxv_i64_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @umaxv_i64_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 0 +; + %out = call i64 @llvm.aarch64.sve.umaxv.nxv2i64( %pg, zeroinitializer) + ret i64 %out +} + +define i64 @umaxv_i64_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @umaxv_i64_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.umaxv.nxv2i64( [[PG]], splat (i64 1)) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.umaxv.nxv2i64( %pg, splat(i64 1)) + ret i64 %out +} + +; +; UMINV +; + +define i8 @uminv_i8_no_active( %a) #0 { +; CHECK-LABEL: define i8 @uminv_i8_no_active( +; CHECK-SAME: [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 -1 +; + %out = call i8 @llvm.aarch64.sve.uminv.nxv16i8( zeroinitializer, %a) + ret i8 %out +} + +define i8 @uminv_i8_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @uminv_i8_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 -1 +; + %out = call i8 @llvm.aarch64.sve.uminv.nxv16i8( %pg, splat(i8 -1)) + ret i8 %out +} + +define i8 @uminv_i8_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i8 @uminv_i8_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i8 @llvm.aarch64.sve.uminv.nxv16i8( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i8 [[OUT]] +; + %out = call i8 @llvm.aarch64.sve.uminv.nxv16i8( %pg, zeroinitializer) + ret i8 %out +} + +define i8 @uminv_i8_all_active_splat(i8 %a) #0 { +; CHECK-LABEL: define i8 @uminv_i8_all_active_splat( +; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i8 [[A]] +; + %a.insert = insertelement poison, i8 %a, i8 0 + %a.splat = shufflevector %a.insert, poison, zeroinitializer + %out = call i8 
@llvm.aarch64.sve.uminv.nxv16i8( splat (i1 true), %a.splat) + ret i8 %out +} + +define i16 @uminv_i16_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @uminv_i16_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i16 -1 +; + %out = call i16 @llvm.aarch64.sve.uminv.nxv8i16( %pg, splat(i16 -1)) + ret i16 %out +} + +define i16 @uminv_i16_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i16 @uminv_i16_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i16 @llvm.aarch64.sve.uminv.nxv8i16( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i16 [[OUT]] +; + %out = call i16 @llvm.aarch64.sve.uminv.nxv8i16( %pg, zeroinitializer) + ret i16 %out +} + +define i32 @uminv_i32_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @uminv_i32_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i32 -1 +; + %out = call i32 @llvm.aarch64.sve.uminv.nxv4i32( %pg, splat(i32 -1)) + ret i32 %out +} + +define i32 @uminv_i32_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i32 @uminv_i32_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i32 @llvm.aarch64.sve.uminv.nxv4i32( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i32 [[OUT]] +; + %out = call i32 @llvm.aarch64.sve.uminv.nxv4i32( %pg, zeroinitializer) + ret i32 %out +} + +define i64 @uminv_i64_splat_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @uminv_i64_splat_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret i64 -1 +; + %out = call i64 @llvm.aarch64.sve.uminv.nxv2i64( %pg, splat(i64 -1)) + ret i64 %out +} + +define i64 @uminv_i64_splat_non_neutral_val( %pg) #0 { +; CHECK-LABEL: define i64 @uminv_i64_splat_non_neutral_val( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uminv.nxv2i64( [[PG]], zeroinitializer) +; CHECK-NEXT: ret i64 [[OUT]] +; + %out = call i64 @llvm.aarch64.sve.uminv.nxv2i64( %pg, zeroinitializer) + ret i64 %out +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/Transforms/InstSimplify/AArch64/lit.local.cfg b/llvm/test/Transforms/InstSimplify/AArch64/lit.local.cfg new file mode 100644 index 0000000000000..10d4a0e953ed4 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AArch64" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll index 20676f3702294..10c265519952b 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll @@ -14,23 +14,23 @@ define void @foo(i64 %n) { ; CHECK-EMPTY: ; CHECK-NEXT: outer.header: ; CHECK-NEXT: EMIT-SCALAR ir<%outer.iv> = phi [ ir<%outer.iv.next>, outer.latch ], [ ir<0>, ir-bb ] -; CHECK-NEXT: EMIT ir<%gep.1> = getelementptr ir<@arr2>, ir<0>, ir<%outer.iv> +; CHECK-NEXT: EMIT ir<%gep.1> = getelementptr inbounds ir<@arr2>, ir<0>, ir<%outer.iv> ; CHECK-NEXT: EMIT store ir<%outer.iv>, ir<%gep.1> -; CHECK-NEXT: EMIT ir<%add> = add ir<%outer.iv>, ir<%n> +; CHECK-NEXT: EMIT ir<%add> = add nsw ir<%outer.iv>, ir<%n> ; CHECK-NEXT: Successor(s): inner ; CHECK-EMPTY: ; CHECK-NEXT: inner: ; CHECK-NEXT: EMIT-SCALAR ir<%inner.iv> = phi [ ir<%inner.iv.next>, inner ], [ ir<0>, outer.header ] -; CHECK-NEXT: EMIT ir<%gep.2> = getelementptr ir<@arr>, ir<0>, ir<%inner.iv>, ir<%outer.iv> +; CHECK-NEXT: EMIT 
ir<%gep.2> = getelementptr inbounds ir<@arr>, ir<0>, ir<%inner.iv>, ir<%outer.iv> ; CHECK-NEXT: EMIT store ir<%add>, ir<%gep.2> -; CHECK-NEXT: EMIT ir<%inner.iv.next> = add ir<%inner.iv>, ir<1> -; CHECK-NEXT: EMIT ir<%inner.ec> = icmp ir<%inner.iv.next>, ir<8> +; CHECK-NEXT: EMIT ir<%inner.iv.next> = add nuw nsw ir<%inner.iv>, ir<1> +; CHECK-NEXT: EMIT ir<%inner.ec> = icmp eq ir<%inner.iv.next>, ir<8> ; CHECK-NEXT: EMIT branch-on-cond ir<%inner.ec> ; CHECK-NEXT: Successor(s): outer.latch, inner ; CHECK-EMPTY: ; CHECK-NEXT: outer.latch: -; CHECK-NEXT: EMIT ir<%outer.iv.next> = add ir<%outer.iv>, ir<1> -; CHECK-NEXT: EMIT ir<%outer.ec> = icmp ir<%outer.iv.next>, ir<8> +; CHECK-NEXT: EMIT ir<%outer.iv.next> = add nuw nsw ir<%outer.iv>, ir<1> +; CHECK-NEXT: EMIT ir<%outer.ec> = icmp eq ir<%outer.iv.next>, ir<8> ; CHECK-NEXT: EMIT branch-on-cond ir<%outer.ec> ; CHECK-NEXT: Successor(s): ir-bb, outer.header ; CHECK-EMPTY: diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index b99d656c5c50f..5742df2aa3c53 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -139,12 +139,12 @@ compound=true "vector.body:\l" + " EMIT vp\<%2\> = CANONICAL-INDUCTION ir\<0\>, vp\<%index.next\>\l" + " EMIT-SCALAR ir\<%indvars.iv\> = phi [ ir\<0\>, vector.ph ], [ ir\<%indvars.iv.next\>, vector.body ]\l" + - " EMIT ir\<%arr.idx\> = getelementptr ir\<%A\>, ir\<%indvars.iv\>\l" + + " EMIT ir\<%arr.idx\> = getelementptr inbounds ir\<%A\>, ir\<%indvars.iv\>\l" + " EMIT ir\<%l1\> = load ir\<%arr.idx\>\l" + " EMIT ir\<%res\> = add ir\<%l1\>, ir\<10\>\l" + " EMIT store ir\<%res\>, ir\<%arr.idx\>\l" + " EMIT ir\<%indvars.iv.next\> = add ir\<%indvars.iv\>, ir\<1\>\l" + - " EMIT ir\<%exitcond\> = icmp ir\<%indvars.iv.next\>, ir\<%N\>\l" + + " EMIT ir\<%exitcond\> = icmp ne ir\<%indvars.iv.next\>, ir\<%N\>\l" + " EMIT vp\<%3\> = not ir\<%exitcond\>\l" + " EMIT vp\<%index.next\> = add nuw vp\<%2\>, vp\<%0\>\l" + " EMIT branch-on-count vp\<%index.next\>, vp\<%1\>\l" + @@ -305,9 +305,9 @@ compound=true "vector.body:\l" + " EMIT vp\<%2\> = CANONICAL-INDUCTION ir\<0\>, vp\<%index.next\>\l" + " EMIT-SCALAR ir\<%iv\> = phi [ ir\<0\>, vector.ph ], [ ir\<%iv.next\>, loop.latch ]\l" + - " EMIT ir\<%arr.idx\> = getelementptr ir\<%A\>, ir\<%iv\>\l" + + " EMIT ir\<%arr.idx\> = getelementptr inbounds ir\<%A\>, ir\<%iv\>\l" + " EMIT ir\<%l1\> = load ir\<%arr.idx\>\l" + - " EMIT ir\<%c\> = icmp ir\<%l1\>, ir\<0\>\l" + + " EMIT ir\<%c\> = icmp eq ir\<%l1\>, ir\<0\>\l" + "Successor(s): loop.latch\l" ] N4 -> N6 [ label=""] @@ -316,7 +316,7 @@ compound=true " EMIT ir\<%res\> = add ir\<%l1\>, ir\<10\>\l" + " EMIT store ir\<%res\>, ir\<%arr.idx\>\l" + " EMIT ir\<%iv.next\> = add ir\<%iv\>, ir\<1\>\l" + - " EMIT ir\<%exitcond\> = icmp ir\<%iv.next\>, ir\<%N\>\l" + + " EMIT ir\<%exitcond\> = icmp ne ir\<%iv.next\>, ir\<%N\>\l" + " EMIT vp\<%3\> = not ir\<%exitcond\>\l" + " EMIT vp\<%index.next\> = add nuw vp\<%2\>, vp\<%0\>\l" + " EMIT branch-on-count vp\<%index.next\>, vp\<%1\>\l" + diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 3842ba235ead3..63776b78a2088 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1009,7 +1009,7 @@ TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { SmallVector Args; Args.push_back(Op1); Args.push_back(Op2); - VPWidenRecipe WidenR(*AI, 
Args, VPIRMetadata(), DebugLoc()); + VPWidenRecipe WidenR(*AI, Args); checkVPRecipeCastImpl(&WidenR); delete AI; @@ -1053,7 +1053,7 @@ TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { Args.push_back(Op1); Args.push_back(Op2); Args.push_back(Op3); - VPWidenSelectRecipe WidenSelectR(*SelectI, + VPWidenSelectRecipe WidenSelectR(SelectI, make_range(Args.begin(), Args.end())); checkVPRecipeCastImpl( @@ -1093,7 +1093,7 @@ TEST_F(VPRecipeTest, CastVPWidenCastRecipeToVPUser) { IntegerType *Int64 = IntegerType::get(C, 64); auto *Cast = CastInst::CreateZExtOrBitCast(PoisonValue::get(Int32), Int64); VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); - VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast, {}); + VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, Cast); checkVPRecipeCastImpl(&Recipe); delete Cast; @@ -1264,7 +1264,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { SmallVector Args; Args.push_back(Op1); Args.push_back(Op2); - VPWidenRecipe Recipe(*AI, Args, VPIRMetadata(), DebugLoc()); + VPWidenRecipe Recipe(*AI, Args); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1283,7 +1283,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { Args.push_back(Op1); Args.push_back(Op2); Args.push_back(Op3); - VPWidenSelectRecipe Recipe(*SelectI, make_range(Args.begin(), Args.end())); + VPWidenSelectRecipe Recipe(SelectI, make_range(Args.begin(), Args.end())); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1412,7 +1412,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { Args.push_back(Op1); Args.push_back(Op2); Args.push_back(CalledFn); - VPWidenCallRecipe Recipe(Call, TheFn, Args); + VPWidenCallRecipe Recipe(Call, TheFn, Args, VPIRFlags(), VPIRMetadata()); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1468,8 +1468,7 @@ TEST_F(VPRecipeTest, dumpRecipeInPlan) { VPValue *ExtVPV2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); Args.push_back(ExtVPV1); Args.push_back(ExtVPV2); - VPWidenRecipe *WidenR = - new VPWidenRecipe(*AI, Args, VPIRMetadata(), DebugLoc()); + VPWidenRecipe *WidenR = new VPWidenRecipe(*AI, Args); VPBB1->appendRecipe(WidenR); { diff --git a/llvm/utils/TableGen/Basic/TableGen.cpp b/llvm/utils/TableGen/Basic/TableGen.cpp index b79ae93dab4f7..a655cbbc16096 100644 --- a/llvm/utils/TableGen/Basic/TableGen.cpp +++ b/llvm/utils/TableGen/Basic/TableGen.cpp @@ -73,7 +73,7 @@ int tblgen_main(int argc, char **argv) { InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv); - std::function MainFn = nullptr; + MultiFileTableGenMainFn MainFn = nullptr; return TableGenMain(argv[0], MainFn); } diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index ef7b13e8940f8..3486a7a7fb08c 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -1878,6 +1878,8 @@ TableGenOutputFiles RegisterInfoEmitter::run(StringRef FilenamePrefix) { if (RegisterInfoDebug) debugDump(errs()); + // The suffixes should be in sync with the tablegen function in + // llvm/cmake/modules/TableGen.cmake. 
return {Main, {{"Enums.inc", Enums}, {"MCDesc.inc", MCDesc}, diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td index 14b00b04ccc18..420e58192b8fd 100644 --- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td @@ -166,4 +166,27 @@ def TosaAttachTarget : Pass<"tosa-attach-target", "ModuleOp"> { ]; } +def TosaNarrowI64ToI32Pass : Pass<"tosa-narrow-i64-to-i32", "func::FuncOp"> { + let summary = "Narrow I64 TOSA operations to I32"; + let description = [{ + This pass narrows TOSA operations with 64-bit integer tensor types to + 32-bit integer tensor types. This can be useful for backends that do not + support the EXT-INT64 extension of TOSA. + }]; + + let options = [ + Option<"aggressiveRewrite", "aggressive-rewrite", "bool", "false", + "If enabled, all TOSA operations are rewritten, regardless or whether the narrowing" + "is safe. This option may lead to data loss if not used carefully.">, + Option<"convertFunctionBoundaries", "convert-function-boundaries", "bool", "false", + "If enabled, the pass will convert function I/O types as well. Otherwise casts will" + "be inserted at the I/O boundaries."> + ]; + + let dependentDialects = [ + "func::FuncDialect", + "tosa::TosaDialect", + ]; +} + #endif // MLIR_DIALECT_TOSA_TRANSFORMS_PASSES diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index edc6565f44f00..b9a5e7d7f6eac 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -1738,15 +1738,11 @@ LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite( auto sourceType = cast(op.getSource().getType()); auto srcElemType = cast(sourceType.getElementType()); unsigned bitWidth = srcElemType.getWidth(); - int32_t scaleSel = - getScaleSel(blockSize, bitWidth, firstScaleLane, firstScaleByte); auto targetType = cast(op.getResult().getType()); auto destElemType = cast(targetType.getElementType()); - IntegerType i32 = rewriter.getI32Type(); - Value castedScale = - LLVM::BitcastOp::create(rewriter, loc, i32, adaptor.getScale()); + IntegerType i32 = rewriter.getI32Type(); Value source = adaptor.getSource(); Type llvmResultType = typeConverter->convertType(op.getResult().getType()); Type packedType = nullptr; @@ -1767,15 +1763,19 @@ LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite( return rewriter.notifyMatchFailure(op, "type conversion failed"); } - Value castedSource = - LLVM::BitcastOp::create(rewriter, loc, packedType, source); - std::optional maybeIntrinsic = scaledExtPacked816ToIntrinsic(srcElemType, destElemType); if (!maybeIntrinsic.has_value()) return op.emitOpError( "no intrinsic matching packed scaled conversion on the given chipset"); + int32_t scaleSel = + getScaleSel(blockSize, bitWidth, firstScaleLane, firstScaleByte); + Value castedScale = + LLVM::BitcastOp::create(rewriter, loc, i32, adaptor.getScale()); + Value castedSource = + LLVM::BitcastOp::create(rewriter, loc, packedType, source); + OperationState loweredOp(loc, *maybeIntrinsic); loweredOp.addTypes({llvmResultType}); loweredOp.addOperands({castedSource, castedScale}); diff --git a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt index 41b338d6e7189..76e9ddd5b2304 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt @@ -12,6 +12,7 @@ 
add_mlir_dialect_library(MLIRTosaTransforms TosaTypeConverters.cpp TosaProfileCompliance.cpp TosaValidation.cpp + TosaNarrowI64ToI32.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Tosa/Transforms @@ -21,6 +22,7 @@ add_mlir_dialect_library(MLIRTosaTransforms LINK_LIBS PUBLIC MLIRFuncDialect + MLIRFuncTransformOps MLIRPass MLIRTosaDialect MLIRTransformUtils diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaNarrowI64ToI32.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaNarrowI64ToI32.cpp new file mode 100644 index 0000000000000..ddaf7d8a5e033 --- /dev/null +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaNarrowI64ToI32.cpp @@ -0,0 +1,310 @@ +//===- TosaNarrowI64ToI32.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass narrows TOSA operations with 64-bit integer tensor types to +// 32-bit integer tensor types. This can be useful for backends that do not +// support the EXT-INT64 extension of TOSA. The pass has two options: +// +// - aggressive-rewrite - If enabled, all TOSA operations are rewritten, +// regardless or whether the narrowing is safe. This option may lead to +// data loss if not used carefully. +// - convert-function-boundaries - If enabled, the pass will convert function +// I/O types as well. Otherwise casts will be inserted at the I/O +// boundaries. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Tosa/Transforms/Passes.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Func/Transforms/FuncConversions.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace tosa { +#define GEN_PASS_DEF_TOSANARROWI64TOI32PASS +#include "mlir/Dialect/Tosa/Transforms/Passes.h.inc" +} // namespace tosa +} // namespace mlir + +using namespace mlir; +using namespace mlir::tosa; + +namespace { + +LogicalResult convertGenericOp(Operation *op, ValueRange operands, + ConversionPatternRewriter &rewriter, + const TypeConverter *typeConverter) { + // Convert types of results + SmallVector newResults; + if (failed(typeConverter->convertTypes(op->getResultTypes(), newResults))) + return failure(); + + // Create a new operation state + OperationState state(op->getLoc(), op->getName().getStringRef(), operands, + newResults, {}, op->getSuccessors()); + + for (const NamedAttribute &namedAttribute : op->getAttrs()) { + const Attribute attribute = namedAttribute.getValue(); + + // Convert integer attribute type + if (const auto intAttr = dyn_cast(attribute)) { + const std::optional convertedAttribute = + typeConverter->convertTypeAttribute(intAttr.getType(), attribute); + state.addAttribute(namedAttribute.getName(), convertedAttribute.value()); + continue; + } + + if (const auto typeAttr = dyn_cast(attribute)) { + Type type = typeAttr.getValue(); + const std::optional convertedAttribute = + typeConverter->convertTypeAttribute(type, attribute); + if (!convertedAttribute) + return rewriter.notifyMatchFailure(op, + "Failed to convert type attribute."); + state.addAttribute(namedAttribute.getName(), convertedAttribute.value()); + continue; + } + + if (const auto denseElementsAttr = dyn_cast(attribute)) { + const Type type = denseElementsAttr.getType(); + const std::optional 
convertedAttribute = + typeConverter->convertTypeAttribute(type, denseElementsAttr); + if (!convertedAttribute) + return rewriter.notifyMatchFailure( + op, "Failed to convert dense elements attribute."); + state.addAttribute(namedAttribute.getName(), convertedAttribute.value()); + continue; + } + + state.addAttribute(namedAttribute.getName(), attribute); + } + + for (Region ®ion : op->getRegions()) { + Region *newRegion = state.addRegion(); + rewriter.inlineRegionBefore(region, *newRegion, newRegion->begin()); + if (failed(rewriter.convertRegionTypes(newRegion, *typeConverter))) + return failure(); + } + + Operation *newOp = rewriter.create(state); + rewriter.replaceOp(op, newOp->getResults()); + return success(); +} + +// =========================== +// Aggressive rewrite patterns +// =========================== + +class ConvertGenericOp : public ConversionPattern { +public: + ConvertGenericOp(TypeConverter &typeConverter, MLIRContext *context) + : ConversionPattern(typeConverter, MatchAnyOpTypeTag{}, 0, context) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + if (!isa(op)) + return rewriter.notifyMatchFailure( + op, + "Support for operations other than TOSA has not been implemented."); + + return convertGenericOp(op, operands, rewriter, typeConverter); + } +}; + +// =============================== +// Bounds checked rewrite patterns +// =============================== + +class ConvertArgMaxOpWithBoundsChecking + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(tosa::ArgMaxOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + // Output type can be narrowed based on the size of the axis dimension + const int32_t axis = op.getAxis(); + const auto inputType = dyn_cast(adaptor.getInput().getType()); + if (!inputType || !inputType.isStaticDim(axis)) + return rewriter.notifyMatchFailure( + op, "Requires a static axis dimension for bounds checking."); + const int64_t axisDim = inputType.getDimSize(axis); + if (axisDim >= std::numeric_limits::max()) + return rewriter.notifyMatchFailure( + op, "Axis dimension is too large to narrow safely."); + + const Type resultType = op.getOutput().getType(); + const Type newResultType = typeConverter->convertType(resultType); + rewriter.replaceOpWithNewOp(op, newResultType, + adaptor.getInput(), axis); + return success(); + } +}; + +class ConvertCastOpWithBoundsChecking + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(tosa::CastOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + const auto inputType = dyn_cast(adaptor.getInput().getType()); + const auto resultType = dyn_cast(op.getResult().getType()); + if (!inputType || !resultType) + return failure(); + + const auto elementInputIntType = + dyn_cast(inputType.getElementType()); + const auto elementResultIntType = + dyn_cast(resultType.getElementType()); + if (elementInputIntType && elementResultIntType && + elementInputIntType.getWidth() > elementResultIntType.getWidth()) + return rewriter.notifyMatchFailure( + op, "Narrowing cast may lead to data loss."); + + rewriter.replaceOpWithNewOp( + op, typeConverter->convertType(resultType), adaptor.getInput()); + return success(); + } +}; + +template +class ConvertTypedOp : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(OpTy op, 
typename OpTy::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + return convertGenericOp(op, adaptor.getOperands(), rewriter, + this->getTypeConverter()); + } +}; + +struct TosaNarrowI64ToI32 + : public tosa::impl::TosaNarrowI64ToI32PassBase { +public: + explicit TosaNarrowI64ToI32() = default; + explicit TosaNarrowI64ToI32(const TosaNarrowI64ToI32PassOptions &options) + : TosaNarrowI64ToI32() { + this->aggressiveRewrite = options.aggressiveRewrite; + this->convertFunctionBoundaries = options.convertFunctionBoundaries; + } + + void runOnOperation() override { + MLIRContext *context = &getContext(); + + TypeConverter typeConverter; + typeConverter.addConversion([](Type type) -> Type { return type; }); + typeConverter.addConversion([](IntegerType type) -> Type { + if (!type.isInteger(64)) + return type; + return IntegerType::get(type.getContext(), 32); + }); + typeConverter.addConversion( + [&typeConverter](RankedTensorType type) -> Type { + const Type elementType = type.getElementType(); + if (!elementType.isInteger(64)) + return type; + return RankedTensorType::get(type.getShape(), + typeConverter.convertType(elementType)); + }); + + const auto materializeCast = [](OpBuilder &builder, Type resultType, + ValueRange inputs, Location loc) -> Value { + if (inputs.size() != 1) + return Value(); + return tosa::CastOp::create(builder, loc, resultType, inputs.front()); + }; + typeConverter.addSourceMaterialization(materializeCast); + typeConverter.addTargetMaterialization(materializeCast); + + typeConverter.addTypeAttributeConversion( + [](IntegerType type, IntegerAttr attribute) -> Attribute { + const APInt value = attribute.getValue().truncSSat(32); + return IntegerAttr::get(IntegerType::get(type.getContext(), 32), + value); + }); + typeConverter.addTypeAttributeConversion( + [&typeConverter](ShapedType type, + DenseIntElementsAttr attr) -> Attribute { + const ShapedType newType = + cast(typeConverter.convertType(type)); + const auto oldElementType = cast(type.getElementType()); + const auto newElementType = + cast(newType.getElementType()); + if (oldElementType.getWidth() == newElementType.getWidth()) + return attr; + + DenseElementsAttr mapped = + attr.mapValues(newElementType, [&](const APInt &v) { + return v.truncSSat(newElementType.getWidth()); + }); + return mapped; + }); + + ConversionTarget target(*context); + target.addDynamicallyLegalDialect( + [&typeConverter](Operation *op) { + return typeConverter.isLegal(op->getResultTypes()) && + typeConverter.isLegal(op->getOperandTypes()); + }); + if (convertFunctionBoundaries) { + target.addDynamicallyLegalOp( + [&typeConverter](func::FuncOp op) { + return typeConverter.isSignatureLegal(op.getFunctionType()) && + typeConverter.isLegal(&op.getBody()); + }); + target.addDynamicallyLegalOp([](func::ReturnOp op) { + const FunctionType funcType = + op->getParentOfType().getFunctionType(); + return llvm::equal(op.getOperandTypes(), funcType.getResults()); + }); + } else { + target.addDynamicallyLegalOp( + [](func::FuncOp op) { return true; }); + target.addDynamicallyLegalOp( + [](func::ReturnOp op) { return true; }); + } + + RewritePatternSet patterns(context); + if (convertFunctionBoundaries) { + populateFunctionOpInterfaceTypeConversionPattern( + patterns, typeConverter); + populateReturnOpTypeConversionPattern(patterns, typeConverter); + } + if (aggressiveRewrite) { + patterns.add(typeConverter, context); + } else { + // Tensor + patterns.add(typeConverter, context); + // Data layout + patterns.add>(typeConverter, context); + 
patterns.add>(typeConverter, context); + patterns.add>(typeConverter, context); + patterns.add>(typeConverter, context); + patterns.add>(typeConverter, context); + patterns.add>(typeConverter, context); + patterns.add>(typeConverter, context); + patterns.add>(typeConverter, context); + // Type conversion + patterns.add(typeConverter, context); + // Controlflow + patterns.add>(typeConverter, context); + patterns.add>(typeConverter, context); + } + + if (failed( + applyFullConversion(getOperation(), target, std::move(patterns)))) + signalPassFailure(); + } +}; + +} // namespace diff --git a/mlir/python/mlir/dialects/gpu/__init__.py b/mlir/python/mlir/dialects/gpu/__init__.py index 2fbcbb059f87a..d15643ca700e4 100644 --- a/mlir/python/mlir/dialects/gpu/__init__.py +++ b/mlir/python/mlir/dialects/gpu/__init__.py @@ -49,13 +49,13 @@ class GPUFuncOp(GPUFuncOp): FUNCTION_TYPE_ATTR_NAME = "function_type" SYM_NAME_ATTR_NAME = "sym_name" - ARGUMENT_ATTR_NAME = "arg_attrs" - RESULT_ATTR_NAME = "res_attrs" def __init__( self, function_type: Union[FunctionType, TypeAttr], sym_name: Optional[Union[str, StringAttr]] = None, + arg_attrs: Optional[Sequence[dict]] = None, + res_attrs: Optional[Sequence[dict]] = None, kernel: Optional[bool] = None, workgroup_attrib_attrs: Optional[Sequence[dict]] = None, private_attrib_attrs: Optional[Sequence[dict]] = None, @@ -88,6 +88,8 @@ def __init__( ) super().__init__( function_type, + arg_attrs=arg_attrs, + res_attrs=res_attrs, workgroup_attrib_attrs=workgroup_attrib_attrs, private_attrib_attrs=private_attrib_attrs, loc=loc, diff --git a/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32-aggressive.mlir b/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32-aggressive.mlir new file mode 100644 index 0000000000000..1a36177a37033 --- /dev/null +++ b/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32-aggressive.mlir @@ -0,0 +1,81 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics -tosa-narrow-i64-to-i32="aggressive-rewrite=1" %s | FileCheck %s --allow-unused-prefixes --check-prefixes=COMMON,DEFAULT +// RUN: mlir-opt -split-input-file -verify-diagnostics -tosa-narrow-i64-to-i32="aggressive-rewrite=1 convert-function-boundaries=1" %s | FileCheck %s --allow-unused-prefixes --check-prefixes=COMMON,FUNCBOUND + +// CHECK-LABEL: test_i64_argmax_large_axis_dim +func.func @test_i64_argmax_large_axis_dim(%arg0: tensor<1x513x513x2147483650xi8>) -> tensor<1x513x513xi64> { + // DEFAULT: tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x2147483650xi8>) -> tensor<1x513x513xi32> + %0 = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x2147483650xi8>) -> tensor<1x513x513xi64> + return %0 : tensor<1x513x513xi64> +} + +// ----- + +// CHECK-LABEL: test_convert_input_parameters +// DEFAULT: %[[IN:.*]]: tensor<1x513x513x3xi64> +// FUNCBOUND: %[[IN:.*]]: tensor<1x513x513x3xi32> +func.func @test_convert_input_parameters(%arg0: tensor<1x513x513x3xi64>) -> tensor<1x513x513x3xf32> { + // DEFAULT: %[[FUNC_BOUND_CAST:.*]] = tosa.cast %[[IN]] : (tensor<1x513x513x3xi64>) -> tensor<1x513x513x3xi32> + // DEFAULT: %[[CAST1:.*]] = tosa.cast %[[FUNC_BOUND_CAST]] : (tensor<1x513x513x3xi32>) -> tensor<1x513x513x3xi32> + // FUNCBOUND: %[[CAST1:.*]] = tosa.cast %[[IN]] : (tensor<1x513x513x3xi32>) -> tensor<1x513x513x3xi32> + %0 = tosa.cast %arg0 : (tensor<1x513x513x3xi64>) -> tensor<1x513x513x3xi32> + + // COMMON: %[[CAST2:.*]] = tosa.cast %[[CAST1]] : (tensor<1x513x513x3xi32>) -> tensor<1x513x513x3xf32> + %1 = tosa.cast %0 : (tensor<1x513x513x3xi32>) -> tensor<1x513x513x3xf32> + return %1 : 
tensor<1x513x513x3xf32>
+}
+
+// -----
+
+// CHECK-LABEL: test_add
+// DEFAULT: %[[IN0:.*]]: tensor<13x21x1xi64>, %[[IN1:.*]]: tensor<13x21x3xi64>
+// FUNCBOUND: %[[IN0:.*]]: tensor<13x21x1xi32>, %[[IN1:.*]]: tensor<13x21x3xi32>
+func.func @test_add(%arg0: tensor<13x21x1xi64>, %arg1: tensor<13x21x3xi64>) -> tensor<13x21x3xi64> {
+  // DEFAULT-DAG: %[[FUNC_BOUND_CAST0:.*]] = tosa.cast %[[IN0]] : (tensor<13x21x1xi64>) -> tensor<13x21x1xi32>
+  // DEFAULT-DAG: %[[FUNC_BOUND_CAST1:.*]] = tosa.cast %[[IN1]] : (tensor<13x21x3xi64>) -> tensor<13x21x3xi32>
+  // DEFAULT: %[[ADD:.*]] = tosa.add %[[FUNC_BOUND_CAST0]], %[[FUNC_BOUND_CAST1]] : (tensor<13x21x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32>
+  // DEFAULT: %[[CAST:.*]] = tosa.cast %[[ADD]] : (tensor<13x21x3xi32>) -> tensor<13x21x3xi64>
+  // DEFAULT: return %[[CAST]] : tensor<13x21x3xi64>
+  // FUNCBOUND: %[[ADD:.*]] = tosa.add %[[IN0]], %[[IN1]] : (tensor<13x21x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32>
+  // FUNCBOUND: return %[[ADD]] : tensor<13x21x3xi32>
+  %0 = tosa.add %arg0, %arg1 : (tensor<13x21x1xi64>, tensor<13x21x3xi64>) -> tensor<13x21x3xi64>
+  return %0 : tensor<13x21x3xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_regions
+// DEFAULT: %[[IN0:.*]]: tensor<i64>, %[[IN1:.*]]: tensor<i64>
+func.func @test_regions(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<i1>) -> tensor<i64> {
+  // DEFAULT-DAG: %[[CAST0:.*]] = tosa.cast %[[IN0]] : (tensor<i64>) -> tensor<i32>
+  // DEFAULT-DAG: %[[CAST1:.*]] = tosa.cast %[[IN1]] : (tensor<i64>) -> tensor<i32>
+  // COMMON: %[[IF_RESULT:.*]] = tosa.cond_if
+  %0 = tosa.cond_if %arg2 : tensor<i1> -> (tensor<i64>) {
+    // DEFAULT: %[[ADD:.*]] = tosa.add %[[CAST0]], %[[CAST1]] : (tensor<i32>, tensor<i32>) -> tensor<i32>
+    // FUNCBOUND: %[[ADD:.*]] = tosa.add %[[IN0]], %[[IN1]] : (tensor<i32>, tensor<i32>) -> tensor<i32>
+    %1 = tosa.add %arg0, %arg1 : (tensor<i64>, tensor<i64>) -> tensor<i64>
+    // COMMON: tosa.yield %[[ADD]] : tensor<i32>
+    tosa.yield %1 : tensor<i64>
+  } else {
+    // DEFAULT: %[[SUB:.*]] = tosa.sub %[[CAST0]], %[[CAST1]] : (tensor<i32>, tensor<i32>) -> tensor<i32>
+    // FUNCBOUND: %[[SUB:.*]] = tosa.sub %[[IN0]], %[[IN1]] : (tensor<i32>, tensor<i32>) -> tensor<i32>
+    %1 = tosa.sub %arg0, %arg1 : (tensor<i64>, tensor<i64>) -> tensor<i64>
+    // COMMON: tosa.yield %[[SUB]] : tensor<i32>
+    tosa.yield %1 : tensor<i64>
+  }
+  // DEFAULT: %[[OUT:.*]] = tosa.cast %[[IF_RESULT]] : (tensor<i32>) -> tensor<i64>
+  // DEFAULT: return %[[OUT]] : tensor<i64>
+  // FUNCBOUND: return %[[IF_RESULT]] : tensor<i32>
+  return %0 : tensor<i64>
+}
+
+// -----
+
+// CHECK-LABEL: test_const
+func.func @test_const() -> tensor<2xi64> {
+  // COMMON: %[[CONST:.*]] = "tosa.const"() <{values = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor<2xi32>
+  %0 = "tosa.const"() <{values = dense<[1, 2]> : tensor<2xi64>}> : () -> tensor<2xi64>
+  // DEFAULT: %[[OUT:.*]] = tosa.cast %[[CONST]] : (tensor<2xi32>) -> tensor<2xi64>
+  // DEFAULT: return %[[OUT]] : tensor<2xi64>
+  // FUNCBOUND: return %[[CONST]] : tensor<2xi32>
+  return %0 : tensor<2xi64>
+}
diff --git a/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32.mlir b/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32.mlir
new file mode 100644
index 0000000000000..a14483fcdd7b0
--- /dev/null
+++ b/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32.mlir
@@ -0,0 +1,162 @@
+// RUN: mlir-opt -split-input-file -verify-diagnostics -tosa-narrow-i64-to-i32="convert-function-boundaries=0" %s | FileCheck %s --allow-unused-prefixes --check-prefixes=COMMON,DEFAULT
+// RUN: mlir-opt -split-input-file -verify-diagnostics -tosa-narrow-i64-to-i32="convert-function-boundaries=1" %s | FileCheck %s --allow-unused-prefixes --check-prefixes=COMMON,FUNCBOUND
+
+// -----
+
+// CHECK-LABEL: test_i64_argmax
+func.func @test_i64_argmax(%arg0: tensor<1x513x513x19xi8>) -> tensor<1x513x513xi64> {
+  // COMMON: %[[ARGMAX:.*]] = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x19xi8>) -> tensor<1x513x513xi32>
+  %0 = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x19xi8>) -> tensor<1x513x513xi64>
+
+  // DEFAULT: %[[CAST:.*]] = tosa.cast %[[ARGMAX]] : (tensor<1x513x513xi32>) -> tensor<1x513x513xi64>
+  // FUNCBOUND: return %[[ARGMAX]] : tensor<1x513x513xi32>
+  return %0 : tensor<1x513x513xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_i64_argmax_cast
+func.func @test_i64_argmax_cast(%arg0: tensor<1x513x513x19xi8>) -> tensor<1x513x513xf32> {
+  // COMMON: %[[ARGMAX:.*]] = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x19xi8>) -> tensor<1x513x513xi32>
+  %0 = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x19xi8>) -> tensor<1x513x513xi64>
+  // COMMON: tosa.cast %[[ARGMAX]] : (tensor<1x513x513xi32>) -> tensor<1x513x513xf32>
+  %1 = tosa.cast %0 : (tensor<1x513x513xi64>) -> tensor<1x513x513xf32>
+  return %1 : tensor<1x513x513xf32>
+}
+
+// -----
+
+// CHECK-LABEL: test_i64_argmax_large_axis_dim
+func.func @test_i64_argmax_large_axis_dim(%arg0: tensor<1x513x513x2147483650xi8>) -> tensor<1x513x513xi64> {
+  // expected-error @+1 {{failed to legalize operation 'tosa.argmax'}}
+  %0 = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x2147483650xi8>) -> tensor<1x513x513xi64>
+  return %0 : tensor<1x513x513xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_add
+func.func @test_add(%arg0: tensor<13x21x1xi64>, %arg1: tensor<13x21x3xi64>) -> tensor<13x21x3xi64> {
+  // expected-error @+1 {{failed to legalize operation 'tosa.add'}}
+  %0 = tosa.add %arg0, %arg1 : (tensor<13x21x1xi64>, tensor<13x21x3xi64>) -> tensor<13x21x3xi64>
+  return %0 : tensor<13x21x3xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_regions
+func.func @test_regions(%arg0: tensor<1x2xi32>, %arg1: tensor<1xi32>, %arg2: tensor<i1>) -> tensor<1xi32> {
+  // COMMON: %[[IF_RESULT:.*]] = tosa.cond_if %arg2 : tensor<i1> -> tensor<1xi32>
+  %0 = tosa.cond_if %arg2 : tensor<i1> -> tensor<1xi32> {
+    // COMMON: %[[ARGMAX:.*]] = tosa.argmax %arg0 {axis = 1 : i32} : (tensor<1x2xi32>) -> tensor<1xi32>
+    %1 = tosa.argmax %arg0 {axis = 1 : i32} : (tensor<1x2xi32>) -> tensor<1xi64>
+    // COMMON: %[[CAST:.*]] = tosa.cast %[[ARGMAX]] : (tensor<1xi32>) -> tensor<1xi32>
+    %2 = tosa.cast %1 : (tensor<1xi64>) -> tensor<1xi32>
+    // COMMON: tosa.yield %[[CAST]] : tensor<1xi32>
+    tosa.yield %2 : tensor<1xi32>
+  } else {
+    tosa.yield %arg1 : tensor<1xi32>
+  }
+  // COMMON: return %[[IF_RESULT]] : tensor<1xi32>
+  return %0 : tensor<1xi32>
+}
+
+// -----
+
+// CHECK-LABEL: test_concat
+func.func @test_concat(%arg0: tensor<13x21x3xi64>, %arg1: tensor<13x21x3xi64>) -> tensor<26x21x3xi64> {
+  // COMMON: tosa.concat %{{.*}}, %{{.*}} {axis = 0 : i32} : (tensor<13x21x3xi32>, tensor<13x21x3xi32>) -> tensor<26x21x3xi32>
+  %0 = tosa.concat %arg0, %arg1 {axis = 0 : i32} : (tensor<13x21x3xi64>, tensor<13x21x3xi64>) -> tensor<26x21x3xi64>
+  return %0 : tensor<26x21x3xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_pad
+func.func @test_pad(%arg0: tensor<13x21x3xi64>, %arg1: tensor<1xi64>) -> tensor<15x23x5xi64> {
+  %padding = tosa.const_shape {values = dense<1> : tensor<6xindex>} : () -> !tosa.shape<6>
+  // COMMON: tosa.pad %{{.*}}, %{{.*}}, %{{.*}} : (tensor<13x21x3xi32>, !tosa.shape<6>, tensor<1xi32>) -> tensor<15x23x5xi32>
+  %1 = tosa.pad %arg0, %padding, %arg1 : (tensor<13x21x3xi64>, !tosa.shape<6>, tensor<1xi64>) -> tensor<15x23x5xi64>
+  return %1 : tensor<15x23x5xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_reshape
+func.func @test_reshape(%arg0: tensor<13x21x3xi64>) -> tensor<1x819xi64> {
+  %1 = tosa.const_shape {values = dense<[1, 819]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  // COMMON: tosa.reshape %{{.*}}, %{{.*}} : (tensor<13x21x3xi32>, !tosa.shape<2>) -> tensor<1x819xi32>
+  %0 = tosa.reshape %arg0, %1 : (tensor<13x21x3xi64>, !tosa.shape<2>) -> tensor<1x819xi64>
+  return %0 : tensor<1x819xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_reverse
+func.func @test_reverse(%arg0: tensor<13x21x3xi64>) -> tensor<13x21x3xi64> {
+  // COMMON: tosa.reverse %{{.*}} {axis = 0 : i32} : (tensor<13x21x3xi32>) -> tensor<13x21x3xi32>
+  %0 = tosa.reverse %arg0 {axis = 0 : i32} : (tensor<13x21x3xi64>) -> tensor<13x21x3xi64>
+  return %0 : tensor<13x21x3xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_slice
+func.func @test_slice(%arg0: tensor<13x21x3xi64>) -> tensor<4x11x1xi64> {
+  %0 = tosa.const_shape {values = dense<[4, 11, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %1 = tosa.const_shape {values = dense<[6, 8, 0]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  // COMMON: tosa.slice %{{.*}}, %{{.*}}, %{{.*}} : (tensor<13x21x3xi32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x11x1xi32>
+  %2 = tosa.slice %arg0, %0, %1 : (tensor<13x21x3xi64>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x11x1xi64>
+  return %2 : tensor<4x11x1xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_tile
+func.func @test_tile(%arg0: tensor<13x21x3xi64>) -> tensor<39x21x6xi64> {
+  %cst = tosa.const_shape { values = dense<[3, 1, 2]> : tensor<3xindex> } : () -> !tosa.shape<3>
+  // COMMON: tosa.tile %{{.*}}, %{{.*}} : (tensor<13x21x3xi32>, !tosa.shape<3>) -> tensor<39x21x6xi32>
+  %0 = tosa.tile %arg0, %cst: (tensor<13x21x3xi64>, !tosa.shape<3>) -> tensor<39x21x6xi64>
+  return %0 : tensor<39x21x6xi64>
+}
+
+// -----
+
+// CHECK-LABEL: transpose
+func.func @test_transpose(%arg0: tensor<13x21x3xi64>) -> tensor<3x13x21xi64> {
+  // COMMON: tosa.transpose %{{.*}} {perms = array<i32: 2, 0, 1>} : (tensor<13x21x3xi32>) -> tensor<3x13x21xi32>
+  %1 = tosa.transpose %arg0 {perms = array<i32: 2, 0, 1>} : (tensor<13x21x3xi64>) -> tensor<3x13x21xi64>
+  return %1 : tensor<3x13x21xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_transition_to_i64
+func.func @test_transition_to_i64(%arg0: tensor<1xi32>) -> tensor<1xi64> {
+  // COMMON: %[[CAST:.*]] = tosa.cast %arg0 : (tensor<1xi32>) -> tensor<1xi32>
+  %0 = tosa.cast %arg0 : (tensor<1xi32>) -> tensor<1xi64>
+  // COMMON: %[[IDENTITY1:.*]] = tosa.identity %[[CAST]] : (tensor<1xi32>) -> tensor<1xi32>
+  %1 = tosa.identity %0 : (tensor<1xi64>) -> tensor<1xi64>
+  // COMMON: %[[IDENTITY2:.*]] = tosa.identity %[[IDENTITY1]] : (tensor<1xi32>) -> tensor<1xi32>
+  %2 = tosa.identity %1 : (tensor<1xi64>) -> tensor<1xi64>
+  // DEFAULT: %[[OUT_CAST:.*]] = tosa.cast %[[IDENTITY2]] : (tensor<1xi32>) -> tensor<1xi64>
+  // DEFAULT: return %[[OUT_CAST]] : tensor<1xi64>
+  // FUNCBOUND: return %[[IDENTITY2]] : tensor<1xi32>
+  return %2 : tensor<1xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_transition_from_i64
+func.func @test_transition_from_i64(%arg0: tensor<1xi64>) -> tensor<1xi32> {
+  // DEFAULT: %[[CAST:.*]] = tosa.cast %arg0 : (tensor<1xi64>) -> tensor<1xi32>
+  // DEFAULT: %[[IDENTITY1:.*]] = tosa.identity %[[CAST]] : (tensor<1xi32>) -> tensor<1xi32>
+  // FUNCBOUND: %[[IDENTITY1:.*]] = tosa.identity %arg0 : (tensor<1xi32>) -> tensor<1xi32>
+  %0 = tosa.identity %arg0 : (tensor<1xi64>) -> tensor<1xi64>
+  // COMMON: %[[IDENTITY2:.*]] = tosa.identity %[[IDENTITY1]] : (tensor<1xi32>) -> tensor<1xi32>
+  %1 = tosa.identity %0 :
(tensor<1xi64>) -> tensor<1xi64> + // COMMON: %[[OUT_CAST:.*]] = tosa.cast %[[IDENTITY2]] : (tensor<1xi32>) -> tensor<1xi32> + %2 = tosa.cast %1 : (tensor<1xi64>) -> tensor<1xi32> + // COMMON: return %[[OUT_CAST]] : tensor<1xi32> + return %2 : tensor<1xi32> +} diff --git a/mlir/test/python/dialects/gpu/dialect.py b/mlir/test/python/dialects/gpu/dialect.py index 3945c99c41091..1a009b7dfa30d 100644 --- a/mlir/test/python/dialects/gpu/dialect.py +++ b/mlir/test/python/dialects/gpu/dialect.py @@ -133,9 +133,10 @@ def builder(func: gpu.GPUFuncOp) -> None: ), func.known_grid_size func = gpu.GPUFuncOp( - func_type, + ir.FunctionType.get(inputs=[T.index()], results=[]), sym_name="non_kernel_func", body_builder=builder, + arg_attrs=[{"gpu.some_attribute": ir.StringAttr.get("foo")}], ) assert not func.is_kernel assert func.known_block_size is None @@ -154,10 +155,11 @@ def builder(func: gpu.GPUFuncOp) -> None: # CHECK: %[[VAL_0:.*]] = gpu.global_id x # CHECK: gpu.return # CHECK: } - # CHECK: gpu.func @non_kernel_func() { - # CHECK: %[[VAL_0:.*]] = gpu.global_id x - # CHECK: gpu.return - # CHECK: } + # CHECK: gpu.func @non_kernel_func( + # CHECK-SAME: %[[ARG0:.*]]: index {gpu.some_attribute = "foo"}) { + # CHECK: %[[GLOBAL_ID_0:.*]] = gpu.global_id x + # CHECK: gpu.return + # CHECK: } # CHECK-LABEL: testGPULaunchFuncOp diff --git a/utils/bazel/MODULE.bazel b/utils/bazel/MODULE.bazel new file mode 100644 index 0000000000000..d061487acf4d7 --- /dev/null +++ b/utils/bazel/MODULE.bazel @@ -0,0 +1,38 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +"""bzlmod configuration for llvm-project""" +module(name = "llvm-project-overlay") + +bazel_dep(name = "apple_support", version = "1.24.1", repo_name = "build_bazel_apple_support") +bazel_dep(name = "bazel_skylib", version = "1.8.2") +bazel_dep(name = "platforms", version = "1.0.0") +bazel_dep(name = "rules_android", version = "0.6.6") +bazel_dep(name = "rules_cc", version = "0.2.11") +bazel_dep(name = "rules_foreign_cc", version = "0.15.1") +bazel_dep(name = "rules_python", version = "1.6.3") +bazel_dep(name = "rules_shell", version = "0.6.1") + +llvm_repos_extension = use_extension(":extensions.bzl", "llvm_repos_extension") + +use_repo( + llvm_repos_extension, + "llvm-raw", + "llvm_zlib", + "vulkan_headers", + "vulkan_sdk_setup", + "gmp", + "mpfr", + "mpc", + "pfm", + "llvm_zstd", + "pybind11", + "pyyaml", + "robin_map", + "nanobind", +) + +llvm_configure = use_repo_rule("@llvm-raw//utils/bazel:configure.bzl", "llvm_configure") + +llvm_configure(name = "llvm-project") diff --git a/utils/bazel/MODULE.bazel.lock b/utils/bazel/MODULE.bazel.lock new file mode 100644 index 0000000000000..64de258401e91 --- /dev/null +++ b/utils/bazel/MODULE.bazel.lock @@ -0,0 +1,490 @@ +{ + "lockFileVersion": 16, + "registryFileHashes": { + "https://bcr.bazel.build/bazel_registry.json": "8a28e4aff06ee60aed2a8c281907fb8bcbf3b753c91fb5a5c57da3215d5b3497", + "https://bcr.bazel.build/modules/abseil-cpp/20210324.2/MODULE.bazel": "7cd0312e064fde87c8d1cd79ba06c876bd23630c83466e9500321be55c96ace2", + "https://bcr.bazel.build/modules/abseil-cpp/20211102.0/MODULE.bazel": "70390338f7a5106231d20620712f7cccb659cd0e9d073d1991c038eb9fc57589", + "https://bcr.bazel.build/modules/abseil-cpp/20230125.1/MODULE.bazel": "89047429cb0207707b2dface14ba7f8df85273d484c2572755be4bab7ce9c3a0", + 
"https://bcr.bazel.build/modules/abseil-cpp/20230802.0.bcr.1/MODULE.bazel": "1c8cec495288dccd14fdae6e3f95f772c1c91857047a098fad772034264cc8cb", + "https://bcr.bazel.build/modules/abseil-cpp/20230802.0/MODULE.bazel": "d253ae36a8bd9ee3c5955384096ccb6baf16a1b1e93e858370da0a3b94f77c16", + "https://bcr.bazel.build/modules/abseil-cpp/20230802.1/MODULE.bazel": "fa92e2eb41a04df73cdabeec37107316f7e5272650f81d6cc096418fe647b915", + "https://bcr.bazel.build/modules/abseil-cpp/20240116.1/MODULE.bazel": "37bcdb4440fbb61df6a1c296ae01b327f19e9bb521f9b8e26ec854b6f97309ed", + "https://bcr.bazel.build/modules/abseil-cpp/20240116.2/MODULE.bazel": "73939767a4686cd9a520d16af5ab440071ed75cec1a876bf2fcfaf1f71987a16", + "https://bcr.bazel.build/modules/abseil-cpp/20250127.0/MODULE.bazel": "d1086e248cda6576862b4b3fe9ad76a214e08c189af5b42557a6e1888812c5d5", + "https://bcr.bazel.build/modules/abseil-cpp/20250127.0/source.json": "1b996859f840d8efc7c720efc61dcf2a84b1261cb3974cbbe9b6666ebf567775", + "https://bcr.bazel.build/modules/abseil-py/2.1.0/MODULE.bazel": "5ebe5bf853769c65707e5c28f216798f7a4b1042015e6a36e6d03094d94bec8a", + "https://bcr.bazel.build/modules/abseil-py/2.1.0/source.json": "0e8fc4f088ce07099c1cd6594c20c7ddbb48b4b3c0849b7d94ba94be88ff042b", + "https://bcr.bazel.build/modules/apple_support/1.11.1/MODULE.bazel": "1843d7cd8a58369a444fc6000e7304425fba600ff641592161d9f15b179fb896", + "https://bcr.bazel.build/modules/apple_support/1.15.1/MODULE.bazel": "a0556fefca0b1bb2de8567b8827518f94db6a6e7e7d632b4c48dc5f865bc7c85", + "https://bcr.bazel.build/modules/apple_support/1.24.1/MODULE.bazel": "f46e8ddad60aef170ee92b2f3d00ef66c147ceafea68b6877cb45bd91737f5f8", + "https://bcr.bazel.build/modules/apple_support/1.24.1/source.json": "cf725267cbacc5f028ef13bb77e7f2c2e0066923a4dab1025e4a0511b1ed258a", + "https://bcr.bazel.build/modules/bazel_features/1.1.0/MODULE.bazel": "cfd42ff3b815a5f39554d97182657f8c4b9719568eb7fded2b9135f084bf760b", + "https://bcr.bazel.build/modules/bazel_features/1.1.1/MODULE.bazel": "27b8c79ef57efe08efccbd9dd6ef70d61b4798320b8d3c134fd571f78963dbcd", + "https://bcr.bazel.build/modules/bazel_features/1.11.0/MODULE.bazel": "f9382337dd5a474c3b7d334c2f83e50b6eaedc284253334cf823044a26de03e8", + "https://bcr.bazel.build/modules/bazel_features/1.13.0/MODULE.bazel": "c14c33c7c3c730612bdbe14ebbb5e61936b6f11322ea95a6e91cd1ba962f94df", + "https://bcr.bazel.build/modules/bazel_features/1.15.0/MODULE.bazel": "d38ff6e517149dc509406aca0db3ad1efdd890a85e049585b7234d04238e2a4d", + "https://bcr.bazel.build/modules/bazel_features/1.17.0/MODULE.bazel": "039de32d21b816b47bd42c778e0454217e9c9caac4a3cf8e15c7231ee3ddee4d", + "https://bcr.bazel.build/modules/bazel_features/1.18.0/MODULE.bazel": "1be0ae2557ab3a72a57aeb31b29be347bcdc5d2b1eb1e70f39e3851a7e97041a", + "https://bcr.bazel.build/modules/bazel_features/1.19.0/MODULE.bazel": "59adcdf28230d220f0067b1f435b8537dd033bfff8db21335ef9217919c7fb58", + "https://bcr.bazel.build/modules/bazel_features/1.21.0/MODULE.bazel": "675642261665d8eea09989aa3b8afb5c37627f1be178382c320d1b46afba5e3b", + "https://bcr.bazel.build/modules/bazel_features/1.23.0/MODULE.bazel": "fd1ac84bc4e97a5a0816b7fd7d4d4f6d837b0047cf4cbd81652d616af3a6591a", + "https://bcr.bazel.build/modules/bazel_features/1.27.0/MODULE.bazel": "621eeee06c4458a9121d1f104efb80f39d34deff4984e778359c60eaf1a8cb65", + "https://bcr.bazel.build/modules/bazel_features/1.28.0/MODULE.bazel": "4b4200e6cbf8fa335b2c3f43e1d6ef3e240319c33d43d60cc0fbd4b87ece299d", + "https://bcr.bazel.build/modules/bazel_features/1.3.0/MODULE.bazel": 
"cdcafe83ec318cda34e02948e81d790aab8df7a929cec6f6969f13a489ccecd9", + "https://bcr.bazel.build/modules/bazel_features/1.30.0/MODULE.bazel": "a14b62d05969a293b80257e72e597c2da7f717e1e69fa8b339703ed6731bec87", + "https://bcr.bazel.build/modules/bazel_features/1.30.0/source.json": "b07e17f067fe4f69f90b03b36ef1e08fe0d1f3cac254c1241a1818773e3423bc", + "https://bcr.bazel.build/modules/bazel_features/1.4.1/MODULE.bazel": "e45b6bb2350aff3e442ae1111c555e27eac1d915e77775f6fdc4b351b758b5d7", + "https://bcr.bazel.build/modules/bazel_features/1.9.1/MODULE.bazel": "8f679097876a9b609ad1f60249c49d68bfab783dd9be012faf9d82547b14815a", + "https://bcr.bazel.build/modules/bazel_skylib/1.0.3/MODULE.bazel": "bcb0fd896384802d1ad283b4e4eb4d718eebd8cb820b0a2c3a347fb971afd9d8", + "https://bcr.bazel.build/modules/bazel_skylib/1.1.1/MODULE.bazel": "1add3e7d93ff2e6998f9e118022c84d163917d912f5afafb3058e3d2f1545b5e", + "https://bcr.bazel.build/modules/bazel_skylib/1.2.0/MODULE.bazel": "44fe84260e454ed94ad326352a698422dbe372b21a1ac9f3eab76eb531223686", + "https://bcr.bazel.build/modules/bazel_skylib/1.2.1/MODULE.bazel": "f35baf9da0efe45fa3da1696ae906eea3d615ad41e2e3def4aeb4e8bc0ef9a7a", + "https://bcr.bazel.build/modules/bazel_skylib/1.3.0/MODULE.bazel": "20228b92868bf5cfc41bda7afc8a8ba2a543201851de39d990ec957b513579c5", + "https://bcr.bazel.build/modules/bazel_skylib/1.4.1/MODULE.bazel": "a0dcb779424be33100dcae821e9e27e4f2901d9dfd5333efe5ac6a8d7ab75e1d", + "https://bcr.bazel.build/modules/bazel_skylib/1.4.2/MODULE.bazel": "3bd40978e7a1fac911d5989e6b09d8f64921865a45822d8b09e815eaa726a651", + "https://bcr.bazel.build/modules/bazel_skylib/1.5.0/MODULE.bazel": "32880f5e2945ce6a03d1fbd588e9198c0a959bb42297b2cfaf1685b7bc32e138", + "https://bcr.bazel.build/modules/bazel_skylib/1.6.1/MODULE.bazel": "8fdee2dbaace6c252131c00e1de4b165dc65af02ea278476187765e1a617b917", + "https://bcr.bazel.build/modules/bazel_skylib/1.7.0/MODULE.bazel": "0db596f4563de7938de764cc8deeabec291f55e8ec15299718b93c4423e9796d", + "https://bcr.bazel.build/modules/bazel_skylib/1.7.1/MODULE.bazel": "3120d80c5861aa616222ec015332e5f8d3171e062e3e804a2a0253e1be26e59b", + "https://bcr.bazel.build/modules/bazel_skylib/1.8.1/MODULE.bazel": "88ade7293becda963e0e3ea33e7d54d3425127e0a326e0d17da085a5f1f03ff6", + "https://bcr.bazel.build/modules/bazel_skylib/1.8.2/MODULE.bazel": "69ad6927098316848b34a9142bcc975e018ba27f08c4ff403f50c1b6e646ca67", + "https://bcr.bazel.build/modules/bazel_skylib/1.8.2/source.json": "34a3c8bcf233b835eb74be9d628899bb32999d3e0eadef1947a0a562a2b16ffb", + "https://bcr.bazel.build/modules/bazel_worker_api/0.0.1/MODULE.bazel": "02a13b77321773b2042e70ee5e4c5e099c8ddee4cf2da9cd420442c36938d4bd", + "https://bcr.bazel.build/modules/bazel_worker_api/0.0.4/MODULE.bazel": "460aa12d01231a80cce03c548287b433b321d205b0028ae596728c35e5ee442e", + "https://bcr.bazel.build/modules/bazel_worker_api/0.0.4/source.json": "d353c410d47a8b65d09fa98e83d57ebec257a2c2b9c6e42d6fda1cb25e5464a5", + "https://bcr.bazel.build/modules/bazel_worker_java/0.0.4/MODULE.bazel": "82494a01018bb7ef06d4a17ec4cd7a758721f10eb8b6c820a818e70d669500db", + "https://bcr.bazel.build/modules/bazel_worker_java/0.0.4/source.json": "a2d30458fd86cf022c2b6331e652526fa08e17573b2f5034a9dbcacdf9c2583c", + "https://bcr.bazel.build/modules/buildozer/7.1.2/MODULE.bazel": "2e8dd40ede9c454042645fd8d8d0cd1527966aa5c919de86661e62953cd73d84", + "https://bcr.bazel.build/modules/buildozer/7.1.2/source.json": "c9028a501d2db85793a6996205c8de120944f50a0d570438fcae0457a5f9d1f8", + 
"https://bcr.bazel.build/modules/gazelle/0.32.0/MODULE.bazel": "b499f58a5d0d3537f3cf5b76d8ada18242f64ec474d8391247438bf04f58c7b8", + "https://bcr.bazel.build/modules/gazelle/0.33.0/MODULE.bazel": "a13a0f279b462b784fb8dd52a4074526c4a2afe70e114c7d09066097a46b3350", + "https://bcr.bazel.build/modules/gazelle/0.34.0/MODULE.bazel": "abdd8ce4d70978933209db92e436deb3a8b737859e9354fb5fd11fb5c2004c8a", + "https://bcr.bazel.build/modules/gazelle/0.36.0/MODULE.bazel": "e375d5d6e9a6ca59b0cb38b0540bc9a05b6aa926d322f2de268ad267a2ee74c0", + "https://bcr.bazel.build/modules/gazelle/0.40.0/MODULE.bazel": "42ba5378ebe845fca43989a53186ab436d956db498acde790685fe0e8f9c6146", + "https://bcr.bazel.build/modules/gazelle/0.40.0/source.json": "1e5ef6e4d8b9b6836d93273c781e78ff829ea2e077afef7a57298040fa4f010a", + "https://bcr.bazel.build/modules/google_benchmark/1.8.2/MODULE.bazel": "a70cf1bba851000ba93b58ae2f6d76490a9feb74192e57ab8e8ff13c34ec50cb", + "https://bcr.bazel.build/modules/googletest/1.11.0/MODULE.bazel": "3a83f095183f66345ca86aa13c58b59f9f94a2f81999c093d4eeaa2d262d12f4", + "https://bcr.bazel.build/modules/googletest/1.14.0.bcr.1/MODULE.bazel": "22c31a561553727960057361aa33bf20fb2e98584bc4fec007906e27053f80c6", + "https://bcr.bazel.build/modules/googletest/1.14.0/MODULE.bazel": "cfbcbf3e6eac06ef9d85900f64424708cc08687d1b527f0ef65aa7517af8118f", + "https://bcr.bazel.build/modules/googletest/1.15.2/MODULE.bazel": "6de1edc1d26cafb0ea1a6ab3f4d4192d91a312fd2d360b63adaa213cd00b2108", + "https://bcr.bazel.build/modules/googletest/1.15.2/source.json": "dbdda654dcb3a0d7a8bc5d0ac5fc7e150b58c2a986025ae5bc634bb2cb61f470", + "https://bcr.bazel.build/modules/jsoncpp/1.9.5/MODULE.bazel": "31271aedc59e815656f5736f282bb7509a97c7ecb43e927ac1a37966e0578075", + "https://bcr.bazel.build/modules/jsoncpp/1.9.6/MODULE.bazel": "2f8d20d3b7d54143213c4dfc3d98225c42de7d666011528dc8fe91591e2e17b0", + "https://bcr.bazel.build/modules/jsoncpp/1.9.6/source.json": "a04756d367a2126c3541682864ecec52f92cdee80a35735a3cb249ce015ca000", + "https://bcr.bazel.build/modules/libpfm/4.11.0/MODULE.bazel": "45061ff025b301940f1e30d2c16bea596c25b176c8b6b3087e92615adbd52902", + "https://bcr.bazel.build/modules/nlohmann_json/3.6.1/MODULE.bazel": "6f7b417dcc794d9add9e556673ad25cb3ba835224290f4f848f8e2db1e1fca74", + "https://bcr.bazel.build/modules/nlohmann_json/3.6.1/source.json": "f448c6e8963fdfa7eb831457df83ad63d3d6355018f6574fb017e8169deb43a9", + "https://bcr.bazel.build/modules/platforms/0.0.10/MODULE.bazel": "8cb8efaf200bdeb2150d93e162c40f388529a25852b332cec879373771e48ed5", + "https://bcr.bazel.build/modules/platforms/0.0.11/MODULE.bazel": "0daefc49732e227caa8bfa834d65dc52e8cc18a2faf80df25e8caea151a9413f", + "https://bcr.bazel.build/modules/platforms/0.0.4/MODULE.bazel": "9b328e31ee156f53f3c416a64f8491f7eb731742655a47c9eec4703a71644aee", + "https://bcr.bazel.build/modules/platforms/0.0.5/MODULE.bazel": "5733b54ea419d5eaf7997054bb55f6a1d0b5ff8aedf0176fef9eea44f3acda37", + "https://bcr.bazel.build/modules/platforms/0.0.6/MODULE.bazel": "ad6eeef431dc52aefd2d77ed20a4b353f8ebf0f4ecdd26a807d2da5aa8cd0615", + "https://bcr.bazel.build/modules/platforms/0.0.7/MODULE.bazel": "72fd4a0ede9ee5c021f6a8dd92b503e089f46c227ba2813ff183b71616034814", + "https://bcr.bazel.build/modules/platforms/0.0.8/MODULE.bazel": "9f142c03e348f6d263719f5074b21ef3adf0b139ee4c5133e2aa35664da9eb2d", + "https://bcr.bazel.build/modules/platforms/0.0.9/MODULE.bazel": "4a87a60c927b56ddd67db50c89acaa62f4ce2a1d2149ccb63ffd871d5ce29ebc", + 
"https://bcr.bazel.build/modules/platforms/1.0.0/MODULE.bazel": "f05feb42b48f1b3c225e4ccf351f367be0371411a803198ec34a389fb22aa580", + "https://bcr.bazel.build/modules/platforms/1.0.0/source.json": "f4ff1fd412e0246fd38c82328eb209130ead81d62dcd5a9e40910f867f733d96", + "https://bcr.bazel.build/modules/protobuf/21.7/MODULE.bazel": "a5a29bb89544f9b97edce05642fac225a808b5b7be74038ea3640fae2f8e66a7", + "https://bcr.bazel.build/modules/protobuf/23.1/MODULE.bazel": "88b393b3eb4101d18129e5db51847cd40a5517a53e81216144a8c32dfeeca52a", + "https://bcr.bazel.build/modules/protobuf/24.4/MODULE.bazel": "7bc7ce5f2abf36b3b7b7c8218d3acdebb9426aeb35c2257c96445756f970eb12", + "https://bcr.bazel.build/modules/protobuf/27.0/MODULE.bazel": "7873b60be88844a0a1d8f80b9d5d20cfbd8495a689b8763e76c6372998d3f64c", + "https://bcr.bazel.build/modules/protobuf/27.1/MODULE.bazel": "703a7b614728bb06647f965264967a8ef1c39e09e8f167b3ca0bb1fd80449c0d", + "https://bcr.bazel.build/modules/protobuf/27.2/MODULE.bazel": "32450b50673882e4c8c3d10a83f3bc82161b213ed2f80d17e38bece8f165c295", + "https://bcr.bazel.build/modules/protobuf/29.0-rc2/MODULE.bazel": "6241d35983510143049943fc0d57937937122baf1b287862f9dc8590fc4c37df", + "https://bcr.bazel.build/modules/protobuf/29.0-rc3/MODULE.bazel": "33c2dfa286578573afc55a7acaea3cada4122b9631007c594bf0729f41c8de92", + "https://bcr.bazel.build/modules/protobuf/29.0/MODULE.bazel": "319dc8bf4c679ff87e71b1ccfb5a6e90a6dbc4693501d471f48662ac46d04e4e", + "https://bcr.bazel.build/modules/protobuf/3.19.0/MODULE.bazel": "6b5fbb433f760a99a22b18b6850ed5784ef0e9928a72668b66e4d7ccd47db9b0", + "https://bcr.bazel.build/modules/protobuf/3.19.2/MODULE.bazel": "532ffe5f2186b69fdde039efe6df13ba726ff338c6bc82275ad433013fa10573", + "https://bcr.bazel.build/modules/protobuf/3.19.6/MODULE.bazel": "9233edc5e1f2ee276a60de3eaa47ac4132302ef9643238f23128fea53ea12858", + "https://bcr.bazel.build/modules/protobuf/31.1/MODULE.bazel": "379a389bb330b7b8c1cdf331cc90bf3e13de5614799b3b52cdb7c6f389f6b38e", + "https://bcr.bazel.build/modules/protobuf/31.1/source.json": "25af5d0219da0c0fc4d1191a24ce438e6ca7f49d2e1a94f354efeba6ef10426f", + "https://bcr.bazel.build/modules/pybind11_bazel/2.11.1/MODULE.bazel": "88af1c246226d87e65be78ed49ecd1e6f5e98648558c14ce99176da041dc378e", + "https://bcr.bazel.build/modules/pybind11_bazel/2.12.0/MODULE.bazel": "e6f4c20442eaa7c90d7190d8dc539d0ab422f95c65a57cc59562170c58ae3d34", + "https://bcr.bazel.build/modules/pybind11_bazel/2.12.0/source.json": "6900fdc8a9e95866b8c0d4ad4aba4d4236317b5c1cd04c502df3f0d33afed680", + "https://bcr.bazel.build/modules/re2/2023-09-01/MODULE.bazel": "cb3d511531b16cfc78a225a9e2136007a48cf8a677e4264baeab57fe78a80206", + "https://bcr.bazel.build/modules/re2/2024-07-02.bcr.1/MODULE.bazel": "b4963dda9b31080be1905ef085ecd7dd6cd47c05c79b9cdf83ade83ab2ab271a", + "https://bcr.bazel.build/modules/re2/2024-07-02.bcr.1/source.json": "2ff292be6ef3340325ce8a045ecc326e92cbfab47c7cbab4bd85d28971b97ac4", + "https://bcr.bazel.build/modules/re2/2024-07-02/MODULE.bazel": "0eadc4395959969297cbcf31a249ff457f2f1d456228c67719480205aa306daa", + "https://bcr.bazel.build/modules/rules_android/0.1.1/MODULE.bazel": "48809ab0091b07ad0182defb787c4c5328bd3a278938415c00a7b69b50c4d3a8", + "https://bcr.bazel.build/modules/rules_android/0.6.6/MODULE.bazel": "b0fb569752aab65ab1a9db0a8f6cfaf5aa1754965e17e95dcf0e4d88e192a68d", + "https://bcr.bazel.build/modules/rules_android/0.6.6/source.json": "a9d8dc2d5a102dc03269a94acc886a4cab82cdcb9ccbc77b0f665d6d17a6ae09", + 
"https://bcr.bazel.build/modules/rules_apple/3.16.0/MODULE.bazel": "0d1caf0b8375942ce98ea944be754a18874041e4e0459401d925577624d3a54a", + "https://bcr.bazel.build/modules/rules_apple/3.16.0/source.json": "d8b5fe461272018cc07cfafce11fe369c7525330804c37eec5a82f84cd475366", + "https://bcr.bazel.build/modules/rules_cc/0.0.1/MODULE.bazel": "cb2aa0747f84c6c3a78dad4e2049c154f08ab9d166b1273835a8174940365647", + "https://bcr.bazel.build/modules/rules_cc/0.0.10/MODULE.bazel": "ec1705118f7eaedd6e118508d3d26deba2a4e76476ada7e0e3965211be012002", + "https://bcr.bazel.build/modules/rules_cc/0.0.13/MODULE.bazel": "0e8529ed7b323dad0775ff924d2ae5af7640b23553dfcd4d34344c7e7a867191", + "https://bcr.bazel.build/modules/rules_cc/0.0.14/MODULE.bazel": "5e343a3aac88b8d7af3b1b6d2093b55c347b8eefc2e7d1442f7a02dc8fea48ac", + "https://bcr.bazel.build/modules/rules_cc/0.0.15/MODULE.bazel": "6704c35f7b4a72502ee81f61bf88706b54f06b3cbe5558ac17e2e14666cd5dcc", + "https://bcr.bazel.build/modules/rules_cc/0.0.16/MODULE.bazel": "7661303b8fc1b4d7f532e54e9d6565771fea666fbdf839e0a86affcd02defe87", + "https://bcr.bazel.build/modules/rules_cc/0.0.17/MODULE.bazel": "2ae1d8f4238ec67d7185d8861cb0a2cdf4bc608697c331b95bf990e69b62e64a", + "https://bcr.bazel.build/modules/rules_cc/0.0.2/MODULE.bazel": "6915987c90970493ab97393024c156ea8fb9f3bea953b2f3ec05c34f19b5695c", + "https://bcr.bazel.build/modules/rules_cc/0.0.6/MODULE.bazel": "abf360251023dfe3efcef65ab9d56beefa8394d4176dd29529750e1c57eaa33f", + "https://bcr.bazel.build/modules/rules_cc/0.0.8/MODULE.bazel": "964c85c82cfeb6f3855e6a07054fdb159aced38e99a5eecf7bce9d53990afa3e", + "https://bcr.bazel.build/modules/rules_cc/0.0.9/MODULE.bazel": "836e76439f354b89afe6a911a7adf59a6b2518fafb174483ad78a2a2fde7b1c5", + "https://bcr.bazel.build/modules/rules_cc/0.1.1/MODULE.bazel": "2f0222a6f229f0bf44cd711dc13c858dad98c62d52bd51d8fc3a764a83125513", + "https://bcr.bazel.build/modules/rules_cc/0.2.11/MODULE.bazel": "e94f24f065bf2191dba2dace951814378b66a94bb3bcc48077492fe0508059b5", + "https://bcr.bazel.build/modules/rules_cc/0.2.11/source.json": "4d555dc20c9c135b21b2e403cf0ce8393fb65711b2305979ce053df4ee3e78de", + "https://bcr.bazel.build/modules/rules_cc/0.2.8/MODULE.bazel": "f1df20f0bf22c28192a794f29b501ee2018fa37a3862a1a2132ae2940a23a642", + "https://bcr.bazel.build/modules/rules_foreign_cc/0.15.1/MODULE.bazel": "c2c60d26c79fda484acb95cdbec46e89d6b28b4845cb277160ce1e0c8622bb88", + "https://bcr.bazel.build/modules/rules_foreign_cc/0.15.1/source.json": "a161811a63ba8a859086da3b7ff3ad04f2e9c255d7727b41087103fc0eb22f55", + "https://bcr.bazel.build/modules/rules_foreign_cc/0.9.0/MODULE.bazel": "c9e8c682bf75b0e7c704166d79b599f93b72cfca5ad7477df596947891feeef6", + "https://bcr.bazel.build/modules/rules_fuzzing/0.5.2/MODULE.bazel": "40c97d1144356f52905566c55811f13b299453a14ac7769dfba2ac38192337a8", + "https://bcr.bazel.build/modules/rules_go/0.41.0/MODULE.bazel": "55861d8e8bb0e62cbd2896f60ff303f62ffcb0eddb74ecb0e5c0cbe36fc292c8", + "https://bcr.bazel.build/modules/rules_go/0.42.0/MODULE.bazel": "8cfa875b9aa8c6fce2b2e5925e73c1388173ea3c32a0db4d2b4804b453c14270", + "https://bcr.bazel.build/modules/rules_go/0.46.0/MODULE.bazel": "3477df8bdcc49e698b9d25f734c4f3a9f5931ff34ee48a2c662be168f5f2d3fd", + "https://bcr.bazel.build/modules/rules_go/0.50.1/MODULE.bazel": "b91a308dc5782bb0a8021ad4330c81fea5bda77f96b9e4c117b9b9c8f6665ee0", + "https://bcr.bazel.build/modules/rules_go/0.51.0-rc2/MODULE.bazel": "edfc3a9cea7bedb0eaaff37b0d7817c1a4bf72b3c615580b0ffcee6c52690fd4", + 
"https://bcr.bazel.build/modules/rules_go/0.51.0-rc2/source.json": "6b5cd0b3da2bd0e6949580851db990a04af0a285f072b9a0f059424457cd8cc9", + "https://bcr.bazel.build/modules/rules_java/4.0.0/MODULE.bazel": "5a78a7ae82cd1a33cef56dc578c7d2a46ed0dca12643ee45edbb8417899e6f74", + "https://bcr.bazel.build/modules/rules_java/5.3.5/MODULE.bazel": "a4ec4f2db570171e3e5eb753276ee4b389bae16b96207e9d3230895c99644b86", + "https://bcr.bazel.build/modules/rules_java/6.0.0/MODULE.bazel": "8a43b7df601a7ec1af61d79345c17b31ea1fedc6711fd4abfd013ea612978e39", + "https://bcr.bazel.build/modules/rules_java/6.3.0/MODULE.bazel": "a97c7678c19f236a956ad260d59c86e10a463badb7eb2eda787490f4c969b963", + "https://bcr.bazel.build/modules/rules_java/6.4.0/MODULE.bazel": "e986a9fe25aeaa84ac17ca093ef13a4637f6107375f64667a15999f77db6c8f6", + "https://bcr.bazel.build/modules/rules_java/6.5.2/MODULE.bazel": "1d440d262d0e08453fa0c4d8f699ba81609ed0e9a9a0f02cd10b3e7942e61e31", + "https://bcr.bazel.build/modules/rules_java/7.1.0/MODULE.bazel": "30d9135a2b6561c761bd67bd4990da591e6bdc128790ce3e7afd6a3558b2fb64", + "https://bcr.bazel.build/modules/rules_java/7.10.0/MODULE.bazel": "530c3beb3067e870561739f1144329a21c851ff771cd752a49e06e3dc9c2e71a", + "https://bcr.bazel.build/modules/rules_java/7.12.2/MODULE.bazel": "579c505165ee757a4280ef83cda0150eea193eed3bef50b1004ba88b99da6de6", + "https://bcr.bazel.build/modules/rules_java/7.2.0/MODULE.bazel": "06c0334c9be61e6cef2c8c84a7800cef502063269a5af25ceb100b192453d4ab", + "https://bcr.bazel.build/modules/rules_java/7.3.2/MODULE.bazel": "50dece891cfdf1741ea230d001aa9c14398062f2b7c066470accace78e412bc2", + "https://bcr.bazel.build/modules/rules_java/7.4.0/MODULE.bazel": "a592852f8a3dd539e82ee6542013bf2cadfc4c6946be8941e189d224500a8934", + "https://bcr.bazel.build/modules/rules_java/7.6.1/MODULE.bazel": "2f14b7e8a1aa2f67ae92bc69d1ec0fa8d9f827c4e17ff5e5f02e91caa3b2d0fe", + "https://bcr.bazel.build/modules/rules_java/8.13.0/MODULE.bazel": "0444ebf737d144cf2bb2ccb368e7f1cce735264285f2a3711785827c1686625e", + "https://bcr.bazel.build/modules/rules_java/8.13.0/source.json": "4605c0f676b87dd9d1fabd4d743b71f04d97503bd1a79aad53f87399fb5396de", + "https://bcr.bazel.build/modules/rules_java/8.3.2/MODULE.bazel": "7336d5511ad5af0b8615fdc7477535a2e4e723a357b6713af439fe8cf0195017", + "https://bcr.bazel.build/modules/rules_java/8.5.1/MODULE.bazel": "d8a9e38cc5228881f7055a6079f6f7821a073df3744d441978e7a43e20226939", + "https://bcr.bazel.build/modules/rules_java/8.6.0/MODULE.bazel": "9c064c434606d75a086f15ade5edb514308cccd1544c2b2a89bbac4310e41c71", + "https://bcr.bazel.build/modules/rules_java/8.6.1/MODULE.bazel": "f4808e2ab5b0197f094cabce9f4b006a27766beb6a9975931da07099560ca9c2", + "https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/MODULE.bazel": "a56b85e418c83eb1839819f0b515c431010160383306d13ec21959ac412d2fe7", + "https://bcr.bazel.build/modules/rules_jvm_external/5.1/MODULE.bazel": "33f6f999e03183f7d088c9be518a63467dfd0be94a11d0055fe2d210f89aa909", + "https://bcr.bazel.build/modules/rules_jvm_external/5.2/MODULE.bazel": "d9351ba35217ad0de03816ef3ed63f89d411349353077348a45348b096615036", + "https://bcr.bazel.build/modules/rules_jvm_external/5.3/MODULE.bazel": "bf93870767689637164657731849fb887ad086739bd5d360d90007a581d5527d", + "https://bcr.bazel.build/modules/rules_jvm_external/6.1/MODULE.bazel": "75b5fec090dbd46cf9b7d8ea08cf84a0472d92ba3585b476f44c326eda8059c4", + "https://bcr.bazel.build/modules/rules_jvm_external/6.2/MODULE.bazel": "36a6e52487a855f33cb960724eb56547fa87e2c98a0474c3acad94339d7f8e99", + 
"https://bcr.bazel.build/modules/rules_jvm_external/6.3/MODULE.bazel": "c998e060b85f71e00de5ec552019347c8bca255062c990ac02d051bb80a38df0", + "https://bcr.bazel.build/modules/rules_jvm_external/6.6/MODULE.bazel": "153042249c7060536dc95b6bb9f9bb8063b8a0b0cb7acdb381bddbc2374aed55", + "https://bcr.bazel.build/modules/rules_jvm_external/6.7/MODULE.bazel": "e717beabc4d091ecb2c803c2d341b88590e9116b8bf7947915eeb33aab4f96dd", + "https://bcr.bazel.build/modules/rules_jvm_external/6.7/source.json": "5426f412d0a7fc6b611643376c7e4a82dec991491b9ce5cb1cfdd25fe2e92be4", + "https://bcr.bazel.build/modules/rules_kotlin/1.9.0/MODULE.bazel": "ef85697305025e5a61f395d4eaede272a5393cee479ace6686dba707de804d59", + "https://bcr.bazel.build/modules/rules_kotlin/1.9.5/MODULE.bazel": "043a16a572f610558ec2030db3ff0c9938574e7dd9f58bded1bb07c0192ef025", + "https://bcr.bazel.build/modules/rules_kotlin/1.9.6/MODULE.bazel": "d269a01a18ee74d0335450b10f62c9ed81f2321d7958a2934e44272fe82dcef3", + "https://bcr.bazel.build/modules/rules_kotlin/1.9.6/source.json": "2faa4794364282db7c06600b7e5e34867a564ae91bda7cae7c29c64e9466b7d5", + "https://bcr.bazel.build/modules/rules_license/0.0.3/MODULE.bazel": "627e9ab0247f7d1e05736b59dbb1b6871373de5ad31c3011880b4133cafd4bd0", + "https://bcr.bazel.build/modules/rules_license/0.0.7/MODULE.bazel": "088fbeb0b6a419005b89cf93fe62d9517c0a2b8bb56af3244af65ecfe37e7d5d", + "https://bcr.bazel.build/modules/rules_license/1.0.0/MODULE.bazel": "a7fda60eefdf3d8c827262ba499957e4df06f659330bbe6cdbdb975b768bb65c", + "https://bcr.bazel.build/modules/rules_license/1.0.0/source.json": "a52c89e54cc311196e478f8382df91c15f7a2bfdf4c6cd0e2675cc2ff0b56efb", + "https://bcr.bazel.build/modules/rules_pkg/0.7.0/MODULE.bazel": "df99f03fc7934a4737122518bb87e667e62d780b610910f0447665a7e2be62dc", + "https://bcr.bazel.build/modules/rules_pkg/1.0.1/MODULE.bazel": "5b1df97dbc29623bccdf2b0dcd0f5cb08e2f2c9050aab1092fd39a41e82686ff", + "https://bcr.bazel.build/modules/rules_pkg/1.0.1/source.json": "bd82e5d7b9ce2d31e380dd9f50c111d678c3bdaca190cb76b0e1c71b05e1ba8a", + "https://bcr.bazel.build/modules/rules_proto/4.0.0/MODULE.bazel": "a7a7b6ce9bee418c1a760b3d84f83a299ad6952f9903c67f19e4edd964894e06", + "https://bcr.bazel.build/modules/rules_proto/5.3.0-21.7/MODULE.bazel": "e8dff86b0971688790ae75528fe1813f71809b5afd57facb44dad9e8eca631b7", + "https://bcr.bazel.build/modules/rules_proto/6.0.0-rc1/MODULE.bazel": "1e5b502e2e1a9e825eef74476a5a1ee524a92297085015a052510b09a1a09483", + "https://bcr.bazel.build/modules/rules_proto/6.0.0/MODULE.bazel": "b531d7f09f58dce456cd61b4579ce8c86b38544da75184eadaf0a7cb7966453f", + "https://bcr.bazel.build/modules/rules_proto/6.0.2/MODULE.bazel": "ce916b775a62b90b61888052a416ccdda405212b6aaeb39522f7dc53431a5e73", + "https://bcr.bazel.build/modules/rules_proto/7.0.2/MODULE.bazel": "bf81793bd6d2ad89a37a40693e56c61b0ee30f7a7fdbaf3eabbf5f39de47dea2", + "https://bcr.bazel.build/modules/rules_proto/7.0.2/source.json": "1e5e7260ae32ef4f2b52fd1d0de8d03b606a44c91b694d2f1afb1d3b28a48ce1", + "https://bcr.bazel.build/modules/rules_python/0.10.2/MODULE.bazel": "cc82bc96f2997baa545ab3ce73f196d040ffb8756fd2d66125a530031cd90e5f", + "https://bcr.bazel.build/modules/rules_python/0.23.1/MODULE.bazel": "49ffccf0511cb8414de28321f5fcf2a31312b47c40cc21577144b7447f2bf300", + "https://bcr.bazel.build/modules/rules_python/0.25.0/MODULE.bazel": "72f1506841c920a1afec76975b35312410eea3aa7b63267436bfb1dd91d2d382", + "https://bcr.bazel.build/modules/rules_python/0.28.0/MODULE.bazel": 
"cba2573d870babc976664a912539b320cbaa7114cd3e8f053c720171cde331ed", + "https://bcr.bazel.build/modules/rules_python/0.31.0/MODULE.bazel": "93a43dc47ee570e6ec9f5779b2e64c1476a6ce921c48cc9a1678a91dd5f8fd58", + "https://bcr.bazel.build/modules/rules_python/0.33.2/MODULE.bazel": "3e036c4ad8d804a4dad897d333d8dce200d943df4827cb849840055be8d2e937", + "https://bcr.bazel.build/modules/rules_python/0.37.1/MODULE.bazel": "3faeb2d9fa0a81f8980643ee33f212308f4d93eea4b9ce6f36d0b742e71e9500", + "https://bcr.bazel.build/modules/rules_python/0.37.2/MODULE.bazel": "b5ffde91410745750b6c13be1c5dc4555ef5bc50562af4a89fd77807fdde626a", + "https://bcr.bazel.build/modules/rules_python/0.4.0/MODULE.bazel": "9208ee05fd48bf09ac60ed269791cf17fb343db56c8226a720fbb1cdf467166c", + "https://bcr.bazel.build/modules/rules_python/0.40.0/MODULE.bazel": "9d1a3cd88ed7d8e39583d9ffe56ae8a244f67783ae89b60caafc9f5cf318ada7", + "https://bcr.bazel.build/modules/rules_python/1.0.0/MODULE.bazel": "898a3d999c22caa585eb062b600f88654bf92efb204fa346fb55f6f8edffca43", + "https://bcr.bazel.build/modules/rules_python/1.2.0/MODULE.bazel": "5aeeb48b2a6c19d668b48adf2b8a2b209a6310c230db0ce77450f148a89846e4", + "https://bcr.bazel.build/modules/rules_python/1.6.3/MODULE.bazel": "a7b80c42cb3de5ee2a5fa1abc119684593704fcd2fec83165ebe615dec76574f", + "https://bcr.bazel.build/modules/rules_python/1.6.3/source.json": "f0be74977e5604a6526c8a416cda22985093ff7d5d380d41722d7e44015cc419", + "https://bcr.bazel.build/modules/rules_robolectric/4.14.1.2/MODULE.bazel": "d44fec647d0aeb67b9f3b980cf68ba634976f3ae7ccd6c07d790b59b87a4f251", + "https://bcr.bazel.build/modules/rules_robolectric/4.14.1.2/source.json": "37c10335f2361c337c5c1f34ed36d2da70534c23088062b33a8bdaab68aa9dea", + "https://bcr.bazel.build/modules/rules_shell/0.1.2/MODULE.bazel": "66e4ca3ce084b04af0b9ff05ff14cab4e5df7503973818bb91cbc6cda08d32fc", + "https://bcr.bazel.build/modules/rules_shell/0.2.0/MODULE.bazel": "fda8a652ab3c7d8fee214de05e7a9916d8b28082234e8d2c0094505c5268ed3c", + "https://bcr.bazel.build/modules/rules_shell/0.3.0/MODULE.bazel": "de4402cd12f4cc8fda2354fce179fdb068c0b9ca1ec2d2b17b3e21b24c1a937b", + "https://bcr.bazel.build/modules/rules_shell/0.6.1/MODULE.bazel": "72e76b0eea4e81611ef5452aa82b3da34caca0c8b7b5c0c9584338aa93bae26b", + "https://bcr.bazel.build/modules/rules_shell/0.6.1/source.json": "20ec05cd5e592055e214b2da8ccb283c7f2a421ea0dc2acbf1aa792e11c03d0c", + "https://bcr.bazel.build/modules/rules_swift/1.16.0/MODULE.bazel": "4a09f199545a60d09895e8281362b1ff3bb08bbde69c6fc87aff5b92fcc916ca", + "https://bcr.bazel.build/modules/rules_swift/2.1.1/MODULE.bazel": "494900a80f944fc7aa61500c2073d9729dff0b764f0e89b824eb746959bc1046", + "https://bcr.bazel.build/modules/rules_swift/2.1.1/source.json": "40fc69dfaac64deddbb75bd99cdac55f4427d9ca0afbe408576a65428427a186", + "https://bcr.bazel.build/modules/stardoc/0.5.1/MODULE.bazel": "1a05d92974d0c122f5ccf09291442580317cdd859f07a8655f1db9a60374f9f8", + "https://bcr.bazel.build/modules/stardoc/0.5.3/MODULE.bazel": "c7f6948dae6999bf0db32c1858ae345f112cacf98f174c7a8bb707e41b974f1c", + "https://bcr.bazel.build/modules/stardoc/0.5.6/MODULE.bazel": "c43dabc564990eeab55e25ed61c07a1aadafe9ece96a4efabb3f8bf9063b71ef", + "https://bcr.bazel.build/modules/stardoc/0.6.2/MODULE.bazel": "7060193196395f5dd668eda046ccbeacebfd98efc77fed418dbe2b82ffaa39fd", + "https://bcr.bazel.build/modules/stardoc/0.7.0/MODULE.bazel": "05e3d6d30c099b6770e97da986c53bd31844d7f13d41412480ea265ac9e8079c", + "https://bcr.bazel.build/modules/stardoc/0.7.1/MODULE.bazel": 
"3548faea4ee5dda5580f9af150e79d0f6aea934fc60c1cc50f4efdd9420759e7", + "https://bcr.bazel.build/modules/stardoc/0.7.2/MODULE.bazel": "fc152419aa2ea0f51c29583fab1e8c99ddefd5b3778421845606ee628629e0e5", + "https://bcr.bazel.build/modules/stardoc/0.7.2/source.json": "58b029e5e901d6802967754adf0a9056747e8176f017cfe3607c0851f4d42216", + "https://bcr.bazel.build/modules/swift_argument_parser/1.3.1.1/MODULE.bazel": "5e463fbfba7b1701d957555ed45097d7f984211330106ccd1352c6e0af0dcf91", + "https://bcr.bazel.build/modules/swift_argument_parser/1.3.1.1/source.json": "32bd87e5f4d7acc57c5b2ff7c325ae3061d5e242c0c4c214ae87e0f1c13e54cb", + "https://bcr.bazel.build/modules/upb/0.0.0-20220923-a547704/MODULE.bazel": "7298990c00040a0e2f121f6c32544bab27d4452f80d9ce51349b1a28f3005c43", + "https://bcr.bazel.build/modules/upb/0.0.0-20230516-61a97ef/MODULE.bazel": "c0df5e35ad55e264160417fd0875932ee3c9dda63d9fccace35ac62f45e1b6f9", + "https://bcr.bazel.build/modules/zlib/1.2.11/MODULE.bazel": "07b389abc85fdbca459b69e2ec656ae5622873af3f845e1c9d80fe179f3effa0", + "https://bcr.bazel.build/modules/zlib/1.2.12/MODULE.bazel": "3b1a8834ada2a883674be8cbd36ede1b6ec481477ada359cd2d3ddc562340b27", + "https://bcr.bazel.build/modules/zlib/1.3.1.bcr.3/MODULE.bazel": "af322bc08976524477c79d1e45e241b6efbeb918c497e8840b8ab116802dda79", + "https://bcr.bazel.build/modules/zlib/1.3.1.bcr.5/MODULE.bazel": "eec517b5bbe5492629466e11dae908d043364302283de25581e3eb944326c4ca", + "https://bcr.bazel.build/modules/zlib/1.3.1.bcr.5/source.json": "22bc55c47af97246cfc093d0acf683a7869377de362b5d1c552c2c2e16b7a806", + "https://bcr.bazel.build/modules/zlib/1.3.1/MODULE.bazel": "751c9940dcfe869f5f7274e1295422a34623555916eb98c174c1e945594bf198" + }, + "selectedYankedVersions": {}, + "moduleExtensions": { + "//:extensions.bzl%llvm_deps_extension": { + "general": { + "bzlTransitiveDigest": "LGeZ4Ibt22AGXloFt/bm3EsBB05m6aTG+WxfH8fJVB4=", + "usagesDigest": "dHBLC1g5cqg/flxcuZRJMp2heDoB4+0/NDd6MutLhGE=", + "recordedFileInputs": {}, + "recordedDirentsInputs": {}, + "envVariables": {}, + "generatedRepoSpecs": { + "llvm-raw": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:local.bzl%new_local_repository", + "attributes": { + "build_file_content": "# empty", + "path": "../../" + } + }, + "llvm_zlib": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:zlib-ng.BUILD", + "sha256": "e36bb346c00472a1f9ff2a0a4643e590a254be6379da7cddd9daeb9a7f296731", + "strip_prefix": "zlib-ng-2.0.7", + "urls": [ + "https://github.com/zlib-ng/zlib-ng/archive/refs/tags/2.0.7.zip" + ] + } + }, + "vulkan_headers": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:vulkan_headers.BUILD", + "sha256": "19f491784ef0bc73caff877d11c96a48b946b5a1c805079d9006e3fbaa5c1895", + "strip_prefix": "Vulkan-Headers-9bd3f561bcee3f01d22912de10bb07ce4e23d378", + "urls": [ + "https://github.com/KhronosGroup/Vulkan-Headers/archive/9bd3f561bcee3f01d22912de10bb07ce4e23d378.tar.gz" + ] + } + }, + "vulkan_sdk_setup": { + "repoRuleId": "@@//:vulkan_sdk.bzl%vulkan_sdk_setup", + "attributes": {} + }, + "gmp": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "urls": [ + "https://gmplib.org/download/gmp/gmp-6.2.1.tar.xz", + "https://ftp.gnu.org/gnu/gmp/gmp-6.2.1.tar.xz" + ], + "build_file": 
"@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:gmp.BUILD", + "sha256": "fd4829912cddd12f84181c3451cc752be224643e87fac497b69edddadc49b4f2", + "strip_prefix": "gmp-6.2.1" + } + }, + "mpfr": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "urls": [ + "https://www.mpfr.org/mpfr-current/mpfr-4.2.2.tar.gz" + ], + "sha256": "826cbb24610bd193f36fde172233fb8c009f3f5c2ad99f644d0dea2e16a20e42", + "strip_prefix": "mpfr-4.2.2", + "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:mpfr.BUILD" + } + }, + "mpc": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "urls": [ + "https://ftp.gnu.org/gnu/mpc/mpc-1.3.1.tar.gz" + ], + "sha256": "ab642492f5cf882b74aa0cb730cd410a81edcdbec895183ce930e706c1c759b8", + "strip_prefix": "mpc-1.3.1", + "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:mpc.BUILD" + } + }, + "pfm": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "urls": [ + "https://versaweb.dl.sourceforge.net/project/perfmon2/libpfm4/libpfm-4.13.0.tar.gz" + ], + "sha256": "d18b97764c755528c1051d376e33545d0eb60c6ebf85680436813fa5b04cc3d1", + "strip_prefix": "libpfm-4.13.0", + "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:pfm.BUILD" + } + }, + "llvm_zstd": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:zstd.BUILD", + "sha256": "7c42d56fac126929a6a85dbc73ff1db2411d04f104fae9bdea51305663a83fd0", + "strip_prefix": "zstd-1.5.2", + "urls": [ + "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz" + ] + } + }, + "pybind11": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "url": "https://github.com/pybind/pybind11/archive/v2.10.3.zip", + "sha256": "201966a61dc826f1b1879a24a3317a1ec9214a918c8eb035be2f30c3e9cfbdcb", + "strip_prefix": "pybind11-2.10.3", + "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:pybind.BUILD" + } + }, + "pyyaml": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "url": "https://github.com/yaml/pyyaml/archive/refs/tags/5.1.zip", + "sha256": "f0a35d7f282a6d6b1a4f3f3965ef5c124e30ed27a0088efb97c0977268fd671f", + "strip_prefix": "pyyaml-5.1/lib3", + "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:pyyaml.BUILD" + } + }, + "robin_map": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:robin_map.BUILD", + "sha256": "a8424ad3b0affd4c57ed26f0f3d8a29604f0e1f2ef2089f497f614b1c94c7236", + "strip_prefix": "robin-map-1.3.0", + "url": "https://github.com/Tessil/robin-map/archive/refs/tags/v1.3.0.tar.gz" + } + }, + "nanobind": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:nanobind.BUILD", + "sha256": "8ce3667dce3e64fc06bfb9b778b6f48731482362fb89a43da156632266cd5a90", + "strip_prefix": "nanobind-2.9.2", + "url": "https://github.com/wjakob/nanobind/archive/refs/tags/v2.9.2.tar.gz" + } + } + }, + "recordedRepoMappingEntries": [ + [ + "", + "bazel_tools", + "bazel_tools" + ] + ] + } + }, + 
"@@rules_android+//rules/android_sdk_repository:rule.bzl%android_sdk_repository_extension": { + "general": { + "bzlTransitiveDigest": "NAy+0M15JNVEBb8Tny6t7j3lKqTnsAMjoBB6LJ+C370=", + "usagesDigest": "g9Ur6X6qhf9a8MmY9qXU/jFjkyk/aZVBegI0yVMF0z4=", + "recordedFileInputs": {}, + "recordedDirentsInputs": {}, + "envVariables": {}, + "generatedRepoSpecs": { + "androidsdk": { + "repoRuleId": "@@rules_android+//rules/android_sdk_repository:rule.bzl%_android_sdk_repository", + "attributes": {} + } + }, + "recordedRepoMappingEntries": [] + } + }, + "@@rules_kotlin+//src/main/starlark/core/repositories:bzlmod_setup.bzl%rules_kotlin_extensions": { + "general": { + "bzlTransitiveDigest": "sFhcgPbDQehmbD1EOXzX4H1q/CD5df8zwG4kp4jbvr8=", + "usagesDigest": "QI2z8ZUR+mqtbwsf2fLqYdJAkPOHdOV+tF2yVAUgRzw=", + "recordedFileInputs": {}, + "recordedDirentsInputs": {}, + "envVariables": {}, + "generatedRepoSpecs": { + "com_github_jetbrains_kotlin_git": { + "repoRuleId": "@@rules_kotlin+//src/main/starlark/core/repositories:compiler.bzl%kotlin_compiler_git_repository", + "attributes": { + "urls": [ + "https://github.com/JetBrains/kotlin/releases/download/v1.9.23/kotlin-compiler-1.9.23.zip" + ], + "sha256": "93137d3aab9afa9b27cb06a824c2324195c6b6f6179d8a8653f440f5bd58be88" + } + }, + "com_github_jetbrains_kotlin": { + "repoRuleId": "@@rules_kotlin+//src/main/starlark/core/repositories:compiler.bzl%kotlin_capabilities_repository", + "attributes": { + "git_repository_name": "com_github_jetbrains_kotlin_git", + "compiler_version": "1.9.23" + } + }, + "com_github_google_ksp": { + "repoRuleId": "@@rules_kotlin+//src/main/starlark/core/repositories:ksp.bzl%ksp_compiler_plugin_repository", + "attributes": { + "urls": [ + "https://github.com/google/ksp/releases/download/1.9.23-1.0.20/artifacts.zip" + ], + "sha256": "ee0618755913ef7fd6511288a232e8fad24838b9af6ea73972a76e81053c8c2d", + "strip_version": "1.9.23-1.0.20" + } + }, + "com_github_pinterest_ktlint": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_file", + "attributes": { + "sha256": "01b2e0ef893383a50dbeb13970fe7fa3be36ca3e83259e01649945b09d736985", + "urls": [ + "https://github.com/pinterest/ktlint/releases/download/1.3.0/ktlint" + ], + "executable": true + } + }, + "rules_android": { + "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive", + "attributes": { + "sha256": "cd06d15dd8bb59926e4d65f9003bfc20f9da4b2519985c27e190cddc8b7a7806", + "strip_prefix": "rules_android-0.1.1", + "urls": [ + "https://github.com/bazelbuild/rules_android/archive/v0.1.1.zip" + ] + } + } + }, + "recordedRepoMappingEntries": [ + [ + "rules_kotlin+", + "bazel_tools", + "bazel_tools" + ] + ] + } + }, + "@@rules_python+//python/uv:uv.bzl%uv": { + "general": { + "bzlTransitiveDigest": "477hS4MXeJ7LqPNLTqL+1ltraV5lqwOw3tEXWqnJRt8=", + "usagesDigest": "icnInV8HDGrRQf9x8RMfxWfBHgT3OgRlYovS/9POEJw=", + "recordedFileInputs": {}, + "recordedDirentsInputs": {}, + "envVariables": {}, + "generatedRepoSpecs": { + "uv": { + "repoRuleId": "@@rules_python+//python/uv/private:uv_toolchains_repo.bzl%uv_toolchains_repo", + "attributes": { + "toolchain_type": "'@@rules_python+//python/uv:uv_toolchain_type'", + "toolchain_names": [ + "none" + ], + "toolchain_implementations": { + "none": "'@@rules_python+//python:none'" + }, + "toolchain_compatible_with": { + "none": [ + "@platforms//:incompatible" + ] + }, + "toolchain_target_settings": {} + } + } + }, + "recordedRepoMappingEntries": [ + [ + "rules_python+", + "bazel_tools", + "bazel_tools" + ] + ] + } + } + } +} 
diff --git a/utils/bazel/extensions.bzl b/utils/bazel/extensions.bzl new file mode 100644 index 0000000000000..b0d5871b722a7 --- /dev/null +++ b/utils/bazel/extensions.bzl @@ -0,0 +1,127 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +"""bzlmod extensions for llvm-project""" + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("@bazel_tools//tools/build_defs/repo:local.bzl", "new_local_repository") +load(":vulkan_sdk.bzl", "vulkan_sdk_setup") + +def _llvm_repos_extension_impl(module_ctx): + if any([m.is_root and m.name == "llvm-project-overlay" for m in module_ctx.modules]): + new_local_repository( + name = "llvm-raw", + build_file_content = "# empty", + path = "../../", + ) + + http_archive( + name = "llvm_zlib", + build_file = "@llvm-raw//utils/bazel/third_party_build:zlib-ng.BUILD", + sha256 = "e36bb346c00472a1f9ff2a0a4643e590a254be6379da7cddd9daeb9a7f296731", + strip_prefix = "zlib-ng-2.0.7", + urls = [ + "https://github.com/zlib-ng/zlib-ng/archive/refs/tags/2.0.7.zip", + ], + ) + + http_archive( + name = "vulkan_headers", + build_file = "@llvm-raw//utils/bazel/third_party_build:vulkan_headers.BUILD", + sha256 = "19f491784ef0bc73caff877d11c96a48b946b5a1c805079d9006e3fbaa5c1895", + strip_prefix = "Vulkan-Headers-9bd3f561bcee3f01d22912de10bb07ce4e23d378", + urls = [ + "https://github.com/KhronosGroup/Vulkan-Headers/archive/9bd3f561bcee3f01d22912de10bb07ce4e23d378.tar.gz", + ], + ) + + vulkan_sdk_setup(name = "vulkan_sdk_setup") + + http_archive( + name = "gmp", + urls = [ + "https://gmplib.org/download/gmp/gmp-6.2.1.tar.xz", + "https://ftp.gnu.org/gnu/gmp/gmp-6.2.1.tar.xz", + ], + build_file = "@llvm-raw//utils/bazel/third_party_build:gmp.BUILD", + sha256 = "fd4829912cddd12f84181c3451cc752be224643e87fac497b69edddadc49b4f2", + strip_prefix = "gmp-6.2.1", + ) + + http_archive( + name = "mpfr", + urls = [ + "https://www.mpfr.org/mpfr-current/mpfr-4.2.2.tar.gz", + ], + sha256 = "826cbb24610bd193f36fde172233fb8c009f3f5c2ad99f644d0dea2e16a20e42", + strip_prefix = "mpfr-4.2.2", + build_file = "@llvm-raw//utils/bazel/third_party_build:mpfr.BUILD", + ) + + http_archive( + name = "mpc", + urls = [ + "https://ftp.gnu.org/gnu/mpc/mpc-1.3.1.tar.gz", + ], + sha256 = "ab642492f5cf882b74aa0cb730cd410a81edcdbec895183ce930e706c1c759b8", + strip_prefix = "mpc-1.3.1", + build_file = "@llvm-raw//utils/bazel/third_party_build:mpc.BUILD", + ) + + http_archive( + name = "pfm", + urls = [ + "https://versaweb.dl.sourceforge.net/project/perfmon2/libpfm4/libpfm-4.13.0.tar.gz", + ], + sha256 = "d18b97764c755528c1051d376e33545d0eb60c6ebf85680436813fa5b04cc3d1", + strip_prefix = "libpfm-4.13.0", + build_file = "@llvm-raw//utils/bazel/third_party_build:pfm.BUILD", + ) + + http_archive( + name = "llvm_zstd", + build_file = "@llvm-raw//utils/bazel/third_party_build:zstd.BUILD", + sha256 = "7c42d56fac126929a6a85dbc73ff1db2411d04f104fae9bdea51305663a83fd0", + strip_prefix = "zstd-1.5.2", + urls = [ + "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz", + ], + ) + + http_archive( + name = "pybind11", + url = "https://github.com/pybind/pybind11/archive/v2.10.3.zip", + sha256 = "201966a61dc826f1b1879a24a3317a1ec9214a918c8eb035be2f30c3e9cfbdcb", + strip_prefix = "pybind11-2.10.3", + build_file = "@llvm-raw//utils/bazel/third_party_build:pybind.BUILD", + ) + + http_archive( + name = "pyyaml", + url = 
"https://github.com/yaml/pyyaml/archive/refs/tags/5.1.zip", + sha256 = "f0a35d7f282a6d6b1a4f3f3965ef5c124e30ed27a0088efb97c0977268fd671f", + strip_prefix = "pyyaml-5.1/lib3", + build_file = "@llvm-raw//utils/bazel/third_party_build:pyyaml.BUILD", + ) + + # TODO: bump to robin-map-1.4.0 + http_archive( + name = "robin_map", + build_file = "@llvm-raw//utils/bazel/third_party_build:robin_map.BUILD", + sha256 = "a8424ad3b0affd4c57ed26f0f3d8a29604f0e1f2ef2089f497f614b1c94c7236", + strip_prefix = "robin-map-1.3.0", + url = "https://github.com/Tessil/robin-map/archive/refs/tags/v1.3.0.tar.gz", + ) + + http_archive( + name = "nanobind", + build_file = "@llvm-raw//utils/bazel/third_party_build:nanobind.BUILD", + sha256 = "8ce3667dce3e64fc06bfb9b778b6f48731482362fb89a43da156632266cd5a90", + strip_prefix = "nanobind-2.9.2", + url = "https://github.com/wjakob/nanobind/archive/refs/tags/v2.9.2.tar.gz", + ) + +llvm_repos_extension = module_extension( + implementation = _llvm_repos_extension_impl, +) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index deb56dc0957e9..790709bdef05c 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1025,6 +1025,7 @@ cc_library( gentbl_cc_library( name = "sema_attr_gen", tbl_outs = { + "include/clang/Sema/AttrIsTypeDependent.inc": ["-gen-clang-attr-is-type-dependent"], "include/clang/Sema/AttrParsedAttrImpl.inc": ["-gen-clang-attr-parsed-attr-impl"], "include/clang/Sema/AttrParsedAttrKinds.inc": ["-gen-clang-attr-parsed-attr-kinds"], "include/clang/Sema/AttrSpellingListIndex.inc": ["-gen-clang-attr-spelling-index"], diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 635f77215b38f..ddad2f4f7611d 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -4100,6 +4100,7 @@ cc_library( ":DebugInfo", ":DebugInfoDWARF", ":JITLink", + ":Object", ":OrcJIT", ":OrcShared", ":Support",