diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst
index d5303418b859b..ff8b05ff263c1 100644
--- a/clang-tools-extra/docs/clang-tidy/Contributing.rst
+++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst
@@ -344,18 +344,20 @@ matching expressions to simplify your matcher.
   clang-query> let c1 cxxRecordDecl()
   clang-query> match c1
 
-Alternatively, pressing the tab key after a previous matcher's open parentheses would also
-show which matchers can be chained with the previous matcher, though some matchers that work
-may not be listed.
-
-Just like breaking up a huge function into smaller chunks with intention-revealing names
-can help you understand a complex algorithm, breaking up a matcher into smaller matchers
-with intention-revealing names can help you understand a complicated matcher.
-
-Once you have a working clang-query matcher, the C++ API matchers will be the same or similar
-to your interactively constructed matcher (there can be cases where they differ slightly).
-You can use local variables to preserve your intention-revealing names that you applied
-to nested matchers.
+Alternatively, pressing the tab key after a previous matcher's open parentheses
+would also show which matchers can be chained with the previous matcher,
+though some matchers that work may not be listed. Note that tab completion
+does not currently work on Windows.
+
+Just like breaking up a huge function into smaller chunks with
+intention-revealing names can help you understand a complex algorithm, breaking
+up a matcher into smaller matchers with intention-revealing names can help
+you understand a complicated matcher.
+
+Once you have a working :program:`clang-query` matcher, the C++ API matchers
+will be the same or similar to your interactively constructed matcher (there
+can be cases where they differ slightly). You can use local variables to
+preserve the intention-revealing names that you applied to nested matchers.
 
 Creating private matchers
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 250821a9f9c45..59ccdf1e15cd8 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -298,6 +298,8 @@ Improvements to Clang's diagnostics
 - Clang now warns for u8 character literals used in C23 with ``-Wpre-c23-compat``
   instead of ``-Wpre-c++17-compat``.
 
+- Clang now diagnoses cases where a dangling ``GSLOwner<Pointer>`` object is constructed, e.g. ``std::vector<std::string_view> v = {std::string()};`` (#GH100526).
+
 Improvements to Clang's time-trace
 ----------------------------------
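A minimal sketch of the C++ translation step described in the Contributing.rst
hunk above, assuming the usual AST-matcher entry points; the matcher and
variable names are illustrative, not taken from this patch:

.. code-block:: c++

   #include "clang/ASTMatchers/ASTMatchers.h"
   using namespace clang::ast_matchers;

   StatementMatcher makeCopyInLoopMatcher() {
     // Each local mirrors a `let` binding from the clang-query session, so
     // the intention-revealing names survive the move to the C++ API.
     auto isCopyConstruction = cxxConstructExpr(
         hasDeclaration(cxxConstructorDecl(isCopyConstructor())));
     auto isInsideLoop = hasAncestor(forStmt());
     return cxxConstructExpr(isCopyConstruction, isInsideLoop).bind("copy");
   }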
diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index fa9c1ac0491c4..56e6179a664e2 100644
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -8,28 +8,48 @@
 //
 // This file defines various SVE builtin types. The macros are:
 //
-// SVE_TYPE(Name, Id, SingletonId) - A builtin type that has not been
-// covered by any other #define. Defining this macro covers all
-// the builtins.
+// SVE_TYPE:
+// - (Name, MangledName, Id, SingletonId)
+// A builtin type that has not been covered by any other #define. Defining
+// this macro covers all the builtin types.
 //
-// SVE_VECTOR_TYPE(Name, Id, SingletonId, ElKind, ElBits, IsSigned, IsFP) -
-// An SVE scalable vector.
+// SVE_VECTOR_TYPE, SVE_PREDICATE_TYPE, SVE_OPAQUE_TYPE:
+// - (Name, MangledName, Id, SingletonId)
+// A builtin type that has not been covered by any other #define. Defining
+// this macro covers the named subset of builtin types.
 //
-// SVE_PREDICATE_TYPE(Name, Id, SingletonId, ElKind) - An SVE scalable
-// predicate.
+// SVE_VECTOR_TYPE_INT
+// - (Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned)
+// Defining the macro covers the integer vector types.
+//
+// SVE_VECTOR_TYPE_FLOAT, SVE_VECTOR_TYPE_BFLOAT:
+// - (Name, MangledName, Id, SingletonId, NumEls, ElBits, NF)
+// Defining the macro covers the floating point vector types.
+//
+// SVE_PREDICATE_TYPE_ALL:
+// - (Name, MangledName, Id, SingletonId, NumEls, NF)
+// Defining the macro covers the boolean vector types.
 //
 // where:
 //
 // - Name is the name of the builtin type.
 //
+// - MangledName is the mangled name of the builtin type.
+//
 // - BuiltinType::Id is the enumerator defining the type.
 //
 // - Context.SingletonId is the global singleton of this type.
 //
 // - ElKind enumerates the type of the elements.
 //
+// - NumEls is the number of elements.
+//
 // - ElBits is the size of one element in bits.
 //
+// - NF is the number of sub-vectors.
+// TODO: Tuple types are represented as a concatenation of "NumEls x ElBits"
+// vectors. This will be changed to become a struct containing NF vectors.
+//
 // - IsSigned is true for vectors of signed integer elements and
 //   for vectors of floating-point elements.
 //
@@ -39,102 +59,134 @@
//===----------------------------------------------------------------------===//
 
 #ifndef SVE_VECTOR_TYPE
-#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits,  \
-                        IsSigned, IsFP, IsBF)                                \
+#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \
   SVE_TYPE(Name, Id, SingletonId)
 #endif
 
+#ifndef SVE_VECTOR_TYPE_DETAILS
+#define SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned, IsFP, IsBF) \
+  SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId)
+#endif
+
+#ifndef SVE_VECTOR_TYPE_BFLOAT
+#define SVE_VECTOR_TYPE_BFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \
+  SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, true)
+#endif
+
+#ifndef SVE_VECTOR_TYPE_FLOAT
+#define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \
+  SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, true, false)
+#endif
+
+#ifndef SVE_VECTOR_TYPE_INT
+#define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned) \
+  SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned, false, false)
+#endif
+
 #ifndef SVE_PREDICATE_TYPE
-#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls) \
+#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \
   SVE_TYPE(Name, Id, SingletonId)
 #endif
 
+#ifndef SVE_PREDICATE_TYPE_ALL
+#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \
+  SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId)
+#endif
+
 #ifndef SVE_OPAQUE_TYPE
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)            \
+#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \
   SVE_TYPE(Name, Id, SingletonId)
 #endif
 
 //===- Vector point types -----------------------------------------------===//
 
+SVE_VECTOR_TYPE_INT("__SVInt8_t", "__SVInt8_t", SveInt8, SveInt8Ty, 16, 8, 1, true)
+SVE_VECTOR_TYPE_INT("__SVInt16_t", "__SVInt16_t", SveInt16, SveInt16Ty, 8, 16, 1, true) +SVE_VECTOR_TYPE_INT("__SVInt32_t", "__SVInt32_t", SveInt32, SveInt32Ty, 4, 32, 1, true) +SVE_VECTOR_TYPE_INT("__SVInt64_t", "__SVInt64_t", SveInt64, SveInt64Ty, 2, 64, 1, true) -SVE_VECTOR_TYPE("__SVInt8_t", "__SVInt8_t", SveInt8, SveInt8Ty, 16, 8, true, false, false) -SVE_VECTOR_TYPE("__SVInt16_t", "__SVInt16_t", SveInt16, SveInt16Ty, 8, 16, true, false, false) -SVE_VECTOR_TYPE("__SVInt32_t", "__SVInt32_t", SveInt32, SveInt32Ty, 4, 32, true, false, false) -SVE_VECTOR_TYPE("__SVInt64_t", "__SVInt64_t", SveInt64, SveInt64Ty, 2, 64, true, false, false) - -SVE_VECTOR_TYPE("__SVUint8_t", "__SVUint8_t", SveUint8, SveUint8Ty, 16, 8, false, false, false) -SVE_VECTOR_TYPE("__SVUint16_t", "__SVUint16_t", SveUint16, SveUint16Ty, 8, 16, false, false, false) -SVE_VECTOR_TYPE("__SVUint32_t", "__SVUint32_t", SveUint32, SveUint32Ty, 4, 32, false, false, false) -SVE_VECTOR_TYPE("__SVUint64_t", "__SVUint64_t", SveUint64, SveUint64Ty, 2, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__SVUint8_t", "__SVUint8_t", SveUint8, SveUint8Ty, 16, 8, 1, false) +SVE_VECTOR_TYPE_INT("__SVUint16_t", "__SVUint16_t", SveUint16, SveUint16Ty, 8, 16, 1, false) +SVE_VECTOR_TYPE_INT("__SVUint32_t", "__SVUint32_t", SveUint32, SveUint32Ty, 4, 32, 1, false) +SVE_VECTOR_TYPE_INT("__SVUint64_t", "__SVUint64_t", SveUint64, SveUint64Ty, 2, 64, 1, false) -SVE_VECTOR_TYPE("__SVFloat16_t", "__SVFloat16_t", SveFloat16, SveFloat16Ty, 8, 16, true, true, false) -SVE_VECTOR_TYPE("__SVFloat32_t", "__SVFloat32_t", SveFloat32, SveFloat32Ty, 4, 32, true, true, false) -SVE_VECTOR_TYPE("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty, 2, 64, true, true, false) +SVE_VECTOR_TYPE_FLOAT("__SVFloat16_t", "__SVFloat16_t", SveFloat16, SveFloat16Ty, 8, 16, 1) +SVE_VECTOR_TYPE_FLOAT("__SVFloat32_t", "__SVFloat32_t", SveFloat32, SveFloat32Ty, 4, 32, 1) +SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty, 2, 64, 1) -SVE_VECTOR_TYPE("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, true, false, true) +SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1) // // x2 // -SVE_VECTOR_TYPE("__clang_svint8x2_t", "svint8x2_t", SveInt8x2, SveInt8x2Ty, 32, 8, true, false, false) -SVE_VECTOR_TYPE("__clang_svint16x2_t", "svint16x2_t", SveInt16x2, SveInt16x2Ty, 16, 16, true, false, false) -SVE_VECTOR_TYPE("__clang_svint32x2_t", "svint32x2_t", SveInt32x2, SveInt32x2Ty, 8, 32, true, false, false) -SVE_VECTOR_TYPE("__clang_svint64x2_t", "svint64x2_t", SveInt64x2, SveInt64x2Ty, 4, 64, true, false, false) -SVE_VECTOR_TYPE("__clang_svuint8x2_t", "svuint8x2_t", SveUint8x2, SveUint8x2Ty, 32, 8, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint16x2_t", "svuint16x2_t", SveUint16x2, SveUint16x2Ty, 16, 16, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint32x2_t", "svuint32x2_t", SveUint32x2, SveUint32x2Ty, 8, 32, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint64x2_t", "svuint64x2_t", SveUint64x2, SveUint64x2Ty, 4, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__clang_svint8x2_t", "svint8x2_t", SveInt8x2, SveInt8x2Ty, 16, 8, 2, true) +SVE_VECTOR_TYPE_INT("__clang_svint16x2_t", "svint16x2_t", SveInt16x2, SveInt16x2Ty, 8, 16, 2, true) +SVE_VECTOR_TYPE_INT("__clang_svint32x2_t", "svint32x2_t", SveInt32x2, SveInt32x2Ty, 4, 32, 2, true) +SVE_VECTOR_TYPE_INT("__clang_svint64x2_t", "svint64x2_t", SveInt64x2, SveInt64x2Ty, 2, 64, 2, true) -SVE_VECTOR_TYPE("__clang_svfloat16x2_t", 
"svfloat16x2_t", SveFloat16x2, SveFloat16x2Ty, 16, 16, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat32x2_t", "svfloat32x2_t", SveFloat32x2, SveFloat32x2Ty, 8, 32, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, SveFloat64x2Ty, 4, 64, true, true, false) +SVE_VECTOR_TYPE_INT("__clang_svuint8x2_t", "svuint8x2_t", SveUint8x2, SveUint8x2Ty, 16 , 8, 2, false) +SVE_VECTOR_TYPE_INT("__clang_svuint16x2_t", "svuint16x2_t", SveUint16x2, SveUint16x2Ty, 8, 16, 2, false) +SVE_VECTOR_TYPE_INT("__clang_svuint32x2_t", "svuint32x2_t", SveUint32x2, SveUint32x2Ty, 4, 32, 2, false) +SVE_VECTOR_TYPE_INT("__clang_svuint64x2_t", "svuint64x2_t", SveUint64x2, SveUint64x2Ty, 2, 64, 2, false) + +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x2_t", "svfloat16x2_t", SveFloat16x2, SveFloat16x2Ty, 8, 16, 2) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x2_t", "svfloat32x2_t", SveFloat32x2, SveFloat32x2Ty, 4, 32, 2) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, SveFloat64x2Ty, 2, 64, 2) + +SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2) -SVE_VECTOR_TYPE("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 16, 16, true, false, true) // // x3 // -SVE_VECTOR_TYPE("__clang_svint8x3_t", "svint8x3_t", SveInt8x3, SveInt8x3Ty, 48, 8, true, false, false) -SVE_VECTOR_TYPE("__clang_svint16x3_t", "svint16x3_t", SveInt16x3, SveInt16x3Ty, 24, 16, true, false, false) -SVE_VECTOR_TYPE("__clang_svint32x3_t", "svint32x3_t", SveInt32x3, SveInt32x3Ty, 12, 32, true, false, false) -SVE_VECTOR_TYPE("__clang_svint64x3_t", "svint64x3_t", SveInt64x3, SveInt64x3Ty, 6, 64, true, false, false) -SVE_VECTOR_TYPE("__clang_svuint8x3_t", "svuint8x3_t", SveUint8x3, SveUint8x3Ty, 48, 8, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint16x3_t", "svuint16x3_t", SveUint16x3, SveUint16x3Ty, 24, 16, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint32x3_t", "svuint32x3_t", SveUint32x3, SveUint32x3Ty, 12, 32, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint64x3_t", "svuint64x3_t", SveUint64x3, SveUint64x3Ty, 6, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__clang_svint8x3_t", "svint8x3_t", SveInt8x3, SveInt8x3Ty, 16, 8, 3, true) +SVE_VECTOR_TYPE_INT("__clang_svint16x3_t", "svint16x3_t", SveInt16x3, SveInt16x3Ty, 8, 16, 3, true) +SVE_VECTOR_TYPE_INT("__clang_svint32x3_t", "svint32x3_t", SveInt32x3, SveInt32x3Ty, 4, 32, 3, true) +SVE_VECTOR_TYPE_INT("__clang_svint64x3_t", "svint64x3_t", SveInt64x3, SveInt64x3Ty, 2, 64, 3, true) + +SVE_VECTOR_TYPE_INT("__clang_svuint8x3_t", "svuint8x3_t", SveUint8x3, SveUint8x3Ty, 16, 8, 3, false) +SVE_VECTOR_TYPE_INT("__clang_svuint16x3_t", "svuint16x3_t", SveUint16x3, SveUint16x3Ty, 8, 16, 3, false) +SVE_VECTOR_TYPE_INT("__clang_svuint32x3_t", "svuint32x3_t", SveUint32x3, SveUint32x3Ty, 4, 32, 3, false) +SVE_VECTOR_TYPE_INT("__clang_svuint64x3_t", "svuint64x3_t", SveUint64x3, SveUint64x3Ty, 2, 64, 3, false) -SVE_VECTOR_TYPE("__clang_svfloat16x3_t", "svfloat16x3_t", SveFloat16x3, SveFloat16x3Ty, 24, 16, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat32x3_t", "svfloat32x3_t", SveFloat32x3, SveFloat32x3Ty, 12, 32, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, SveFloat64x3Ty, 6, 64, true, true, false) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x3_t", "svfloat16x3_t", SveFloat16x3, SveFloat16x3Ty, 8, 16, 3) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x3_t", "svfloat32x3_t", SveFloat32x3, SveFloat32x3Ty, 4, 32, 3) 
+SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, SveFloat64x3Ty, 2, 64, 3) + +SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3) -SVE_VECTOR_TYPE("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 24, 16, true, false, true) // // x4 // -SVE_VECTOR_TYPE("__clang_svint8x4_t", "svint8x4_t", SveInt8x4, SveInt8x4Ty, 64, 8, true, false, false) -SVE_VECTOR_TYPE("__clang_svint16x4_t", "svint16x4_t", SveInt16x4, SveInt16x4Ty, 32, 16, true, false, false) -SVE_VECTOR_TYPE("__clang_svint32x4_t", "svint32x4_t", SveInt32x4, SveInt32x4Ty, 16, 32, true, false, false) -SVE_VECTOR_TYPE("__clang_svint64x4_t", "svint64x4_t", SveInt64x4, SveInt64x4Ty, 8, 64, true, false, false) -SVE_VECTOR_TYPE("__clang_svuint8x4_t", "svuint8x4_t", SveUint8x4, SveUint8x4Ty, 64, 8, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint16x4_t", "svuint16x4_t", SveUint16x4, SveUint16x4Ty, 32, 16, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint32x4_t", "svuint32x4_t", SveUint32x4, SveUint32x4Ty, 16, 32, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint64x4_t", "svuint64x4_t", SveUint64x4, SveUint64x4Ty, 8, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__clang_svint8x4_t", "svint8x4_t", SveInt8x4, SveInt8x4Ty, 16, 8, 4, true) +SVE_VECTOR_TYPE_INT("__clang_svint16x4_t", "svint16x4_t", SveInt16x4, SveInt16x4Ty, 8, 16, 4, true) +SVE_VECTOR_TYPE_INT("__clang_svint32x4_t", "svint32x4_t", SveInt32x4, SveInt32x4Ty, 4, 32, 4, true) +SVE_VECTOR_TYPE_INT("__clang_svint64x4_t", "svint64x4_t", SveInt64x4, SveInt64x4Ty, 2, 64, 4, true) + +SVE_VECTOR_TYPE_INT("__clang_svuint8x4_t", "svuint8x4_t", SveUint8x4, SveUint8x4Ty, 16, 8, 4, false) +SVE_VECTOR_TYPE_INT("__clang_svuint16x4_t", "svuint16x4_t", SveUint16x4, SveUint16x4Ty, 8, 16, 4, false) +SVE_VECTOR_TYPE_INT("__clang_svuint32x4_t", "svuint32x4_t", SveUint32x4, SveUint32x4Ty, 4, 32, 4, false) +SVE_VECTOR_TYPE_INT("__clang_svuint64x4_t", "svuint64x4_t", SveUint64x4, SveUint64x4Ty, 2, 64, 4, false) -SVE_VECTOR_TYPE("__clang_svfloat16x4_t", "svfloat16x4_t", SveFloat16x4, SveFloat16x4Ty, 32, 16, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat32x4_t", "svfloat32x4_t", SveFloat32x4, SveFloat32x4Ty, 16, 32, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, SveFloat64x4Ty, 8, 64, true, true, false) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x4_t", "svfloat16x4_t", SveFloat16x4, SveFloat16x4Ty, 8, 16, 4) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x4_t", "svfloat32x4_t", SveFloat32x4, SveFloat32x4Ty, 4, 32, 4) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, SveFloat64x4Ty, 2, 64, 4) -SVE_VECTOR_TYPE("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 32, 16, true, false, true) +SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4) -SVE_PREDICATE_TYPE("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16) -SVE_PREDICATE_TYPE("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 32) -SVE_PREDICATE_TYPE("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4Ty, 64) +SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1) +SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2) +SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4Ty, 16, 4) SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy) #undef SVE_VECTOR_TYPE +#undef 
SVE_VECTOR_TYPE_BFLOAT
+#undef SVE_VECTOR_TYPE_FLOAT
+#undef SVE_VECTOR_TYPE_INT
 #undef SVE_PREDICATE_TYPE
+#undef SVE_PREDICATE_TYPE_ALL
 #undef SVE_OPAQUE_TYPE
 #undef SVE_TYPE
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 546e5100b79dd..9f72456d2da67 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -6690,6 +6690,20 @@ When the Owner's lifetime ends, it will consider the Pointer to be dangling.
     P.getInt(); // P is dangling
   }
 
+If a template class is annotated with ``[[gsl::Owner]]``, and the first
+instantiated template argument is a pointer type (raw pointer, or ``[[gsl::Pointer]]``),
+the analysis will consider the instantiated class as a container of the pointer.
+When constructing such an object from a GSL owner object, the analysis will
+assume that the container holds a pointer to the owner object. Consequently,
+when the owner object is destroyed, the pointer will be considered dangling.
+
+.. code-block:: c++
+
+   int f() {
+     std::vector<std::string_view> v = {std::string()}; // v holds a dangling pointer.
+     std::optional<std::string_view> o = std::string(); // o holds a dangling pointer.
+   }
+
 }];
 }
 
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index fa9cc38efc466..8ece39a383046 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -2203,13 +2203,12 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
     // Because the length is only known at runtime, we use a dummy value
     // of 0 for the static length. The alignment values are those defined
     // by the Procedure Call Standard for the Arm Architecture.
-#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits,   \
-                        IsSigned, IsFP, IsBF)                                 \
+#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                   \
   case BuiltinType::Id:                                                       \
     Width = 0;                                                                \
     Align = 128;                                                              \
     break;
-#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls)        \
+#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId)                \
   case BuiltinType::Id:                                                       \
     Width = 0;                                                                \
     Align = 16;                                                               \
     break;
@@ -4284,108 +4283,27 @@ ASTContext::getBuiltinVectorTypeInfo(const BuiltinType *Ty) const {
   switch (Ty->getKind()) {
   default:
     llvm_unreachable("Unsupported builtin vector type");
-  case BuiltinType::SveInt8:
-    return SVE_INT_ELTTY(8, 16, true, 1);
-  case BuiltinType::SveUint8:
-    return SVE_INT_ELTTY(8, 16, false, 1);
-  case BuiltinType::SveInt8x2:
-    return SVE_INT_ELTTY(8, 16, true, 2);
-  case BuiltinType::SveUint8x2:
-    return SVE_INT_ELTTY(8, 16, false, 2);
-  case BuiltinType::SveInt8x3:
-    return SVE_INT_ELTTY(8, 16, true, 3);
-  case BuiltinType::SveUint8x3:
-    return SVE_INT_ELTTY(8, 16, false, 3);
-  case BuiltinType::SveInt8x4:
-    return SVE_INT_ELTTY(8, 16, true, 4);
-  case BuiltinType::SveUint8x4:
-    return SVE_INT_ELTTY(8, 16, false, 4);
-  case BuiltinType::SveInt16:
-    return SVE_INT_ELTTY(16, 8, true, 1);
-  case BuiltinType::SveUint16:
-    return SVE_INT_ELTTY(16, 8, false, 1);
-  case BuiltinType::SveInt16x2:
-    return SVE_INT_ELTTY(16, 8, true, 2);
-  case BuiltinType::SveUint16x2:
-    return SVE_INT_ELTTY(16, 8, false, 2);
-  case BuiltinType::SveInt16x3:
-    return SVE_INT_ELTTY(16, 8, true, 3);
-  case BuiltinType::SveUint16x3:
-    return SVE_INT_ELTTY(16, 8, false, 3);
-  case BuiltinType::SveInt16x4:
-    return SVE_INT_ELTTY(16, 8, true, 4);
-  case BuiltinType::SveUint16x4:
-    return SVE_INT_ELTTY(16, 8, false, 4);
-  case BuiltinType::SveInt32:
-    return SVE_INT_ELTTY(32, 4, true, 1);
-  case BuiltinType::SveUint32:
-    return SVE_INT_ELTTY(32, 4, false, 1);
-  case 
BuiltinType::SveInt32x2: - return SVE_INT_ELTTY(32, 4, true, 2); - case BuiltinType::SveUint32x2: - return SVE_INT_ELTTY(32, 4, false, 2); - case BuiltinType::SveInt32x3: - return SVE_INT_ELTTY(32, 4, true, 3); - case BuiltinType::SveUint32x3: - return SVE_INT_ELTTY(32, 4, false, 3); - case BuiltinType::SveInt32x4: - return SVE_INT_ELTTY(32, 4, true, 4); - case BuiltinType::SveUint32x4: - return SVE_INT_ELTTY(32, 4, false, 4); - case BuiltinType::SveInt64: - return SVE_INT_ELTTY(64, 2, true, 1); - case BuiltinType::SveUint64: - return SVE_INT_ELTTY(64, 2, false, 1); - case BuiltinType::SveInt64x2: - return SVE_INT_ELTTY(64, 2, true, 2); - case BuiltinType::SveUint64x2: - return SVE_INT_ELTTY(64, 2, false, 2); - case BuiltinType::SveInt64x3: - return SVE_INT_ELTTY(64, 2, true, 3); - case BuiltinType::SveUint64x3: - return SVE_INT_ELTTY(64, 2, false, 3); - case BuiltinType::SveInt64x4: - return SVE_INT_ELTTY(64, 2, true, 4); - case BuiltinType::SveUint64x4: - return SVE_INT_ELTTY(64, 2, false, 4); - case BuiltinType::SveBool: - return SVE_ELTTY(BoolTy, 16, 1); - case BuiltinType::SveBoolx2: - return SVE_ELTTY(BoolTy, 16, 2); - case BuiltinType::SveBoolx4: - return SVE_ELTTY(BoolTy, 16, 4); - case BuiltinType::SveFloat16: - return SVE_ELTTY(HalfTy, 8, 1); - case BuiltinType::SveFloat16x2: - return SVE_ELTTY(HalfTy, 8, 2); - case BuiltinType::SveFloat16x3: - return SVE_ELTTY(HalfTy, 8, 3); - case BuiltinType::SveFloat16x4: - return SVE_ELTTY(HalfTy, 8, 4); - case BuiltinType::SveFloat32: - return SVE_ELTTY(FloatTy, 4, 1); - case BuiltinType::SveFloat32x2: - return SVE_ELTTY(FloatTy, 4, 2); - case BuiltinType::SveFloat32x3: - return SVE_ELTTY(FloatTy, 4, 3); - case BuiltinType::SveFloat32x4: - return SVE_ELTTY(FloatTy, 4, 4); - case BuiltinType::SveFloat64: - return SVE_ELTTY(DoubleTy, 2, 1); - case BuiltinType::SveFloat64x2: - return SVE_ELTTY(DoubleTy, 2, 2); - case BuiltinType::SveFloat64x3: - return SVE_ELTTY(DoubleTy, 2, 3); - case BuiltinType::SveFloat64x4: - return SVE_ELTTY(DoubleTy, 2, 4); - case BuiltinType::SveBFloat16: - return SVE_ELTTY(BFloat16Ty, 8, 1); - case BuiltinType::SveBFloat16x2: - return SVE_ELTTY(BFloat16Ty, 8, 2); - case BuiltinType::SveBFloat16x3: - return SVE_ELTTY(BFloat16Ty, 8, 3); - case BuiltinType::SveBFloat16x4: - return SVE_ELTTY(BFloat16Ty, 8, 4); + +#define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF, IsSigned) \ + case BuiltinType::Id: \ + return {getIntTypeForBitwidth(ElBits, IsSigned), \ + llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + case BuiltinType::Id: \ + return {ElBits == 16 ? HalfTy : (ElBits == 32 ? 
FloatTy : DoubleTy), \ + llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_VECTOR_TYPE_BFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + case BuiltinType::Id: \ + return {BFloat16Ty, llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ + case BuiltinType::Id: \ + return {BoolTy, llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) +#include "clang/Basic/AArch64SVEACLETypes.def" + #define RVV_VECTOR_TYPE_INT(Name, Id, SingletonId, NumEls, ElBits, NF, \ IsSigned) \ case BuiltinType::Id: \ @@ -4425,22 +4343,30 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts, unsigned NumFields) const { if (Target->hasAArch64SVETypes()) { uint64_t EltTySize = getTypeSize(EltTy); -#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits, \ - IsSigned, IsFP, IsBF) \ - if (!EltTy->isBooleanType() && \ - ((EltTy->hasIntegerRepresentation() && \ - EltTy->hasSignedIntegerRepresentation() == IsSigned) || \ - (EltTy->hasFloatingRepresentation() && !EltTy->isBFloat16Type() && \ - IsFP && !IsBF) || \ - (EltTy->hasFloatingRepresentation() && EltTy->isBFloat16Type() && \ - IsBF && !IsFP)) && \ - EltTySize == ElBits && NumElts == NumEls) { \ + +#define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF, IsSigned) \ + if (EltTy->hasIntegerRepresentation() && !EltTy->isBooleanType() && \ + EltTy->hasSignedIntegerRepresentation() == IsSigned && \ + EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ return SingletonId; \ } -#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls) \ - if (EltTy->isBooleanType() && NumElts == NumEls) \ +#define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + if (EltTy->hasFloatingRepresentation() && !EltTy->isBFloat16Type() && \ + EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ + return SingletonId; \ + } +#define SVE_VECTOR_TYPE_BFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + if (EltTy->hasFloatingRepresentation() && EltTy->isBFloat16Type() && \ + EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ + return SingletonId; \ + } +#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ + if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1) \ return SingletonId; -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingleTonId) +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" } else if (Target->hasRISCVVTypes()) { uint64_t EltTySize = getTypeSize(EltTy); diff --git a/clang/lib/AST/ByteCode/EvaluationResult.cpp b/clang/lib/AST/ByteCode/EvaluationResult.cpp index bdebd19af9f94..627d4b2f65be9 100644 --- a/clang/lib/AST/ByteCode/EvaluationResult.cpp +++ b/clang/lib/AST/ByteCode/EvaluationResult.cpp @@ -178,8 +178,8 @@ bool EvaluationResult::checkFullyInitialized(InterpState &S, static void collectBlocks(const Pointer &Ptr, llvm::SetVector &Blocks) { auto isUsefulPtr = [](const Pointer &P) -> bool { - return P.isLive() && !P.isZero() && !P.isDummy() && - !P.isUnknownSizeArray() && !P.isOnePastEnd() && P.isBlockPointer(); + return P.isLive() && !P.isZero() && !P.isDummy() && P.isDereferencable() && + !P.isUnknownSizeArray() && !P.isOnePastEnd(); }; if (!isUsefulPtr(Ptr)) diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp 
b/clang/lib/AST/ByteCode/InterpFrame.cpp index 6830a7b37f1da..28e189bb339e6 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -207,31 +207,40 @@ Pointer InterpFrame::getParamPointer(unsigned Off) { return Pointer(B); } +static bool funcHasUsableBody(const Function *F) { + assert(F); + + if (F->isConstructor() || F->isDestructor()) + return true; + + return !F->getDecl()->isImplicit(); +} + SourceInfo InterpFrame::getSource(CodePtr PC) const { // Implicitly created functions don't have any code we could point at, // so return the call site. - if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller) + if (Func && !funcHasUsableBody(Func) && Caller) return Caller->getSource(RetPC); return S.getSource(Func, PC); } const Expr *InterpFrame::getExpr(CodePtr PC) const { - if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller) - return Caller->getExpr(RetPC); + if (Func && !funcHasUsableBody(Func) && Caller) + return Caller->getExpr(PC); return S.getExpr(Func, PC); } SourceLocation InterpFrame::getLocation(CodePtr PC) const { - if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller) + if (Func && !funcHasUsableBody(Func) && Caller) return Caller->getLocation(RetPC); return S.getLocation(Func, PC); } SourceRange InterpFrame::getRange(CodePtr PC) const { - if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller) + if (Func && !funcHasUsableBody(Func) && Caller) return Caller->getRange(RetPC); return S.getRange(Func, PC); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 7d638befcbd3f..b6e1da0c3192d 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3384,8 +3384,7 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { // The SVE types are effectively target-specific. The mangling scheme // is defined in the appendices to the Procedure Call Standard for the // Arm Architecture. -#define SVE_VECTOR_TYPE(InternalName, MangledName, Id, SingletonId, NumEls, \ - ElBits, IsSigned, IsFP, IsBF) \ +#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ if (T->getKind() == BuiltinType::SveBFloat16 && \ isCompatibleWith(LangOptions::ClangABI::Ver17)) { \ @@ -3394,21 +3393,18 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { Out << "u" << type_name.size() << type_name; \ } else { \ type_name = MangledName; \ - Out << (type_name == InternalName ? "u" : "") << type_name.size() \ - << type_name; \ + Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ } \ break; -#define SVE_PREDICATE_TYPE(InternalName, MangledName, Id, SingletonId, NumEls) \ +#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ type_name = MangledName; \ - Out << (type_name == InternalName ? "u" : "") << type_name.size() \ - << type_name; \ + Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ break; -#define SVE_OPAQUE_TYPE(InternalName, MangledName, Id, SingletonId) \ +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ type_name = MangledName; \ - Out << (type_name == InternalName ? "u" : "") << type_name.size() \ - << type_name; \ + Out << (type_name == Name ? 
"u" : "") << type_name.size() << type_name; \ break; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index add6a5d10d61f..be627a6242eb4 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -2077,7 +2077,7 @@ void TypePrinter::printHLSLAttributedResourceAfter( << HLSLResourceClassAttr::ConvertResourceClassToStr(Attrs.ResourceClass) << ")]]"; if (Attrs.IsROV) - OS << " [[hlsl::is_rov()]]"; + OS << " [[hlsl::is_rov]]"; } void TypePrinter::printObjCInterfaceBefore(const ObjCInterfaceType *T, diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index adc7cdbfded88..6280e9465ecba 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1699,11 +1699,18 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { llvm::Value *EmitIvarOffset(CodeGenFunction &CGF, const ObjCInterfaceDecl *Interface, const ObjCIvarDecl *Ivar) override { - const std::string Name = GetIVarOffsetVariableName(Ivar->getContainingInterface(), Ivar); + const ObjCInterfaceDecl *ContainingInterface = + Ivar->getContainingInterface(); + const std::string Name = + GetIVarOffsetVariableName(ContainingInterface, Ivar); llvm::GlobalVariable *IvarOffsetPointer = TheModule.getNamedGlobal(Name); - if (!IvarOffsetPointer) + if (!IvarOffsetPointer) { IvarOffsetPointer = new llvm::GlobalVariable(TheModule, IntTy, false, llvm::GlobalValue::ExternalLinkage, nullptr, Name); + if (Ivar->getAccessControl() != ObjCIvarDecl::Private && + Ivar->getAccessControl() != ObjCIvarDecl::Package) + CGM.setGVProperties(IvarOffsetPointer, ContainingInterface); + } CharUnits Align = CGM.getIntAlign(); llvm::Value *Offset = CGF.Builder.CreateAlignedLoad(IntTy, IvarOffsetPointer, Align); diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 11a577bbdd078..5eebd8ad2a065 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -500,63 +500,19 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { case BuiltinType::OCLReserveID: ResultType = CGM.getOpenCLRuntime().convertOpenCLSpecificType(Ty); break; - case BuiltinType::SveInt8: - case BuiltinType::SveUint8: - case BuiltinType::SveInt8x2: - case BuiltinType::SveUint8x2: - case BuiltinType::SveInt8x3: - case BuiltinType::SveUint8x3: - case BuiltinType::SveInt8x4: - case BuiltinType::SveUint8x4: - case BuiltinType::SveInt16: - case BuiltinType::SveUint16: - case BuiltinType::SveInt16x2: - case BuiltinType::SveUint16x2: - case BuiltinType::SveInt16x3: - case BuiltinType::SveUint16x3: - case BuiltinType::SveInt16x4: - case BuiltinType::SveUint16x4: - case BuiltinType::SveInt32: - case BuiltinType::SveUint32: - case BuiltinType::SveInt32x2: - case BuiltinType::SveUint32x2: - case BuiltinType::SveInt32x3: - case BuiltinType::SveUint32x3: - case BuiltinType::SveInt32x4: - case BuiltinType::SveUint32x4: - case BuiltinType::SveInt64: - case BuiltinType::SveUint64: - case BuiltinType::SveInt64x2: - case BuiltinType::SveUint64x2: - case BuiltinType::SveInt64x3: - case BuiltinType::SveUint64x3: - case BuiltinType::SveInt64x4: - case BuiltinType::SveUint64x4: - case BuiltinType::SveBool: - case BuiltinType::SveBoolx2: - case BuiltinType::SveBoolx4: - case BuiltinType::SveFloat16: - case BuiltinType::SveFloat16x2: - case BuiltinType::SveFloat16x3: - case BuiltinType::SveFloat16x4: - case BuiltinType::SveFloat32: - case BuiltinType::SveFloat32x2: - case 
BuiltinType::SveFloat32x3:
-  case BuiltinType::SveFloat32x4:
-  case BuiltinType::SveFloat64:
-  case BuiltinType::SveFloat64x2:
-  case BuiltinType::SveFloat64x3:
-  case BuiltinType::SveFloat64x4:
-  case BuiltinType::SveBFloat16:
-  case BuiltinType::SveBFloat16x2:
-  case BuiltinType::SveBFloat16x3:
-  case BuiltinType::SveBFloat16x4: {
-    ASTContext::BuiltinVectorTypeInfo Info =
-        Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(Ty));
-    return llvm::ScalableVectorType::get(ConvertType(Info.ElementType),
-                                         Info.EC.getKnownMinValue() *
-                                             Info.NumVectors);
-  }
+#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                   \
+  case BuiltinType::Id:
+#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId)                \
+  case BuiltinType::Id:
+#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
+#include "clang/Basic/AArch64SVEACLETypes.def"
+    {
+      ASTContext::BuiltinVectorTypeInfo Info =
+          Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(Ty));
+      return llvm::ScalableVectorType::get(ConvertType(Info.ElementType),
+                                           Info.EC.getKnownMinValue() *
+                                               Info.NumVectors);
+    }
   case BuiltinType::SveCount:
     return llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
 #define PPC_VECTOR_TYPE(Name, Id, Size) \
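The CodeGenTypes hunk above shows the intended consumption pattern for the
reworked .def file. As a standalone sketch (the counting function is
hypothetical, not part of this patch), a client defines only the granularity
it cares about and stubs out the root macro:

.. code-block:: c++

   // Count the SVE integer vector types; every other entry collapses into
   // the no-op SVE_TYPE fallback via the .def file's default definitions.
   static unsigned countSveIntVectorTypes() {
     unsigned N = 0;
   #define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls,    \
                               ElBits, NF, IsSigned)                          \
     ++N;
   #define SVE_TYPE(Name, Id, SingletonId) /* ignore */
   #include "clang/Basic/AArch64SVEACLETypes.def"
     return N; // The .def file #undefs both macros at the end of inclusion.
   }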
diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp
index f1507ebb9a506..c8e703036c132 100644
--- a/clang/lib/Sema/CheckExprLifetime.cpp
+++ b/clang/lib/Sema/CheckExprLifetime.cpp
@@ -267,6 +267,26 @@ static bool isInStlNamespace(const Decl *D) {
   return DC->isStdNamespace();
 }
 
+// Returns true if the given Record decl is a form of `GSLOwner<Pointer>`
+// type, e.g. std::vector<string_view>, std::optional<string_view>.
+static bool isContainerOfPointer(const RecordDecl *Container) {
+  if (const auto *CTSD =
+          dyn_cast_if_present<ClassTemplateSpecializationDecl>(Container)) {
+    if (!CTSD->hasAttr<OwnerAttr>()) // Container must be a GSL owner type.
+      return false;
+    const auto &TAs = CTSD->getTemplateArgs();
+    return TAs.size() > 0 && TAs[0].getKind() == TemplateArgument::Type &&
+           (isRecordWithAttr<PointerAttr>(TAs[0].getAsType()) ||
+            TAs[0].getAsType()->isPointerType());
+  }
+  return false;
+}
+
+static bool isGSLOwner(QualType T) {
+  return isRecordWithAttr<OwnerAttr>(T) &&
+         !isContainerOfPointer(T->getAsRecordDecl());
+}
+
 static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) {
   if (auto *Conv = dyn_cast_or_null<CXXConversionDecl>(Callee))
     if (isRecordWithAttr<PointerAttr>(Conv->getConversionType()))
@@ -275,7 +295,7 @@ static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) {
     return false;
   if (!isRecordWithAttr<PointerAttr>(
           Callee->getFunctionObjectParameterType()) &&
-      !isRecordWithAttr<OwnerAttr>(Callee->getFunctionObjectParameterType()))
+      !isGSLOwner(Callee->getFunctionObjectParameterType()))
     return false;
   if (Callee->getReturnType()->isPointerType() ||
       isRecordWithAttr<PointerAttr>(Callee->getReturnType())) {
@@ -413,7 +433,7 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call,
       // Once we initialized a value with a non gsl-owner reference, it can no
       // longer dangle.
       if (ReturnType->isReferenceType() &&
-          !isRecordWithAttr<OwnerAttr>(ReturnType->getPointeeType())) {
+          !isGSLOwner(ReturnType->getPointeeType())) {
         for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) {
           if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit ||
               PE.Kind == IndirectLocalPathEntry::LifetimeBoundCall)
@@ -468,12 +488,17 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call,
     if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr<LifetimeBoundAttr>())
       VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]);
     else if (EnableGSLAnalysis && I == 0) {
+      // Perform GSL analysis for the first argument
       if (shouldTrackFirstArgument(Callee)) {
         VisitGSLPointerArg(Callee, Args[0]);
-      } else if (auto *CCE = dyn_cast<CXXConstructExpr>(Call);
-                 CCE &&
-                 CCE->getConstructor()->getParent()->hasAttr<PointerAttr>()) {
-        VisitGSLPointerArg(CCE->getConstructor(), Args[0]);
+      } else if (auto *Ctor = dyn_cast<CXXConstructExpr>(Call)) {
+        const auto *ClassD = Ctor->getConstructor()->getParent();
+        // Two cases:
+        //   a GSL pointer, e.g. std::string_view
+        //   a container of GSL pointer, e.g. std::vector<string_view>
+        if (ClassD->hasAttr<PointerAttr>() ||
+            (isContainerOfPointer(ClassD) && Callee->getNumParams() == 1))
+          VisitGSLPointerArg(Ctor->getConstructor(), Args[0]);
       }
     }
@@ -990,13 +1015,12 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
       //   int &p = *localUniquePtr;
       //   someContainer.add(std::move(localUniquePtr));
       //   return p;
-      IsLocalGslOwner = isRecordWithAttr<OwnerAttr>(L->getType());
+      IsLocalGslOwner = isGSLOwner(L->getType());
       if (pathContainsInit(Path) || !IsLocalGslOwner)
         return false;
     } else {
       IsGslPtrValueFromGslTempOwner =
-          MTE && !MTE->getExtendingDecl() &&
-          isRecordWithAttr<OwnerAttr>(MTE->getType());
+          MTE && !MTE->getExtendingDecl() && isGSLOwner(MTE->getType());
       // Skipping a chain of initializing gsl::Pointer annotated objects.
       // We are looking only for the final source to find out if it was
       // a local or temporary owner or the address of a local variable/param.
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 14feafd1e6b17..a14a086731c13 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -5171,7 +5171,8 @@ static bool HasNonDeletedDefaultedEqualityComparison(Sema &S,
       //   const ClassT& obj;
       OpaqueValueExpr Operand(
-          {}, Decl->getTypeForDecl()->getCanonicalTypeUnqualified().withConst(),
+          KeyLoc,
+          Decl->getTypeForDecl()->getCanonicalTypeUnqualified().withConst(),
           ExprValueKind::VK_LValue);
       UnresolvedSet<16> Functions;
       // obj == obj;
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index f2158226e6ca7..4e44813fe515c 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -592,6 +592,10 @@ bool clang::CreateHLSLAttributedResourceType(Sema &S, QualType Wrapped,
     break;
   }
   case attr::HLSLROV:
+    if (ResAttrs.IsROV) {
+      S.Diag(A->getLocation(), diag::warn_duplicate_attribute_exact) << A;
+      return false;
+    }
     ResAttrs.IsROV = true;
     break;
   default:
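For reference, the user-visible pattern that the CheckExprLifetime changes
above diagnose, as a compilable sketch (it mirrors the AttrDocs example rather
than adding new coverage):

.. code-block:: c++

   #include <optional>
   #include <string>
   #include <string_view>
   #include <vector>

   void demo() {
     // The temporary std::string (a GSL owner) dies at the end of each
     // full-expression, so the string_view values dangle immediately.
     std::vector<std::string_view> v = {std::string("temp")};
     std::optional<std::string_view> o = std::string("temp");
   }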
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index cf207be33175c..e1fc9cea1eb2b 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -1210,6 +1210,10 @@ ExprResult SemaOpenACC::CheckReductionVar(Expr *VarExpr) {
 
 void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K,
                                  SourceLocation DirLoc) {
+  // Start an evaluation context to parse the clause arguments on.
+  SemaRef.PushExpressionEvaluationContext(
+      Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
+
   switch (K) {
   case OpenACCDirectiveKind::Invalid:
     // Nothing to do here, an invalid kind has nothing we can check here.  We
@@ -1626,6 +1630,8 @@ ExprResult SemaOpenACC::ActOnArraySectionExpr(Expr *Base, SourceLocation LBLoc,
 
 bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K,
                                           SourceLocation StartLoc) {
+  SemaRef.DiscardCleanupsInEvaluationContext();
+  SemaRef.PopExpressionEvaluationContext();
   return diagnoseConstructAppertainment(*this, K, StartLoc, /*IsStmt=*/true);
 }
 
@@ -1649,6 +1655,7 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K,
                                 ParentlessLoopConstructs);
       ParentlessLoopConstructs.clear();
+
     return ComputeConstruct;
   }
   case OpenACCDirectiveKind::Loop: {
@@ -1704,6 +1711,11 @@ StmtResult SemaOpenACC::ActOnAssociatedStmt(SourceLocation DirectiveLoc,
 
 bool SemaOpenACC::ActOnStartDeclDirective(OpenACCDirectiveKind K,
                                           SourceLocation StartLoc) {
+  // OpenACC 3.3 2.1 (line 889)
+  // A program must not depend on the order of evaluation of expressions in
+  // clause arguments or on any side effects of the evaluations.
+  SemaRef.DiscardCleanupsInEvaluationContext();
+  SemaRef.PopExpressionEvaluationContext();
   return diagnoseConstructAppertainment(*this, K, StartLoc, /*IsStmt=*/false);
 }
 
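A sketch of the push/pop discipline the SemaOpenACC hunks above establish; in
the real change the two halves live in ActOnConstruct and the
ActOnStart*Directive callbacks, and the helper shown here is illustrative:

.. code-block:: c++

   #include "clang/Sema/Sema.h"

   void parseOpenACCConstruct(clang::Sema &S) {
     // Clause arguments get their own evaluation context...
     S.PushExpressionEvaluationContext(
         clang::Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
     // ... parse the directive's clause expressions here ...
     // ...and their cleanups are dropped before the directive is formed,
     // since OpenACC 3.3 (2.1) says a program must not depend on evaluation
     // order or side effects of clause arguments.
     S.DiscardCleanupsInEvaluationContext();
     S.PopExpressionEvaluationContext();
   }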
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 4c88159ea4ced..562c57a41299a 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -5502,10 +5502,6 @@ static TemplateDeductionResult CheckDeductionConsistency(
     ArrayRef<TemplateArgument> DeducedArgs, bool CheckConsistency) {
   MultiLevelTemplateArgumentList MLTAL(FTD, DeducedArgs, /*Final=*/true);
-  if (ArgIdx != -1)
-    if (auto *MD = dyn_cast<CXXMethodDecl>(FTD->getTemplatedDecl());
-        MD && MD->isImplicitObjectMemberFunction())
-      ArgIdx -= 1;
   Sema::ArgumentPackSubstitutionIndexRAII PackIndex(
       S, ArgIdx != -1 ? ::getPackIndexForParam(S, FTD, MLTAL, ArgIdx) : -1);
   bool IsIncompleteSubstitution = false;
@@ -5576,12 +5572,10 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
 
 /// Determine whether the function template \p FT1 is at least as
 /// specialized as \p FT2.
-static bool isAtLeastAsSpecializedAs(Sema &S, SourceLocation Loc,
-                                     FunctionTemplateDecl *FT1,
-                                     FunctionTemplateDecl *FT2,
-                                     TemplatePartialOrderingContext TPOC,
-                                     ArrayRef<QualType> Args1,
-                                     ArrayRef<QualType> Args2) {
+static bool isAtLeastAsSpecializedAs(
+    Sema &S, SourceLocation Loc, FunctionTemplateDecl *FT1,
+    FunctionTemplateDecl *FT2, TemplatePartialOrderingContext TPOC,
+    ArrayRef<QualType> Args1, ArrayRef<QualType> Args2, bool Args1Offset) {
   FunctionDecl *FD1 = FT1->getTemplatedDecl();
   FunctionDecl *FD2 = FT2->getTemplatedDecl();
   const FunctionProtoType *Proto1 = FD1->getType()->getAs<FunctionProtoType>();
@@ -5676,6 +5670,8 @@ static bool isAtLeastAsSpecializedAs(Sema &S, SourceLocation Loc,
           TemplateDeductionInfo &Info,
           SmallVectorImpl<DeducedTemplateArgument> &Deduced,
           PartialOrderingKind) {
+        if (ArgIdx != -1)
+          ArgIdx -= Args1Offset;
         return ::CheckDeductionConsistency(
             S, FTD, ArgIdx, P, A, DeducedArgs,
             /*CheckConsistency=*/HasDeducedParam[ParamIdx]);
@@ -5763,6 +5759,8 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate(
   const FunctionDecl *FD2 = FT2->getTemplatedDecl();
   bool ShouldConvert1 = false;
   bool ShouldConvert2 = false;
+  bool Args1Offset = false;
+  bool Args2Offset = false;
   QualType Obj1Ty;
   QualType Obj2Ty;
   if (TPOC == TPOC_Call) {
@@ -5811,6 +5809,7 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate(
       Obj1Ty = GetImplicitObjectParameterType(this->Context, Method1,
                                               RawObj1Ty, IsRValRef2);
       Args1.push_back(Obj1Ty);
+      Args1Offset = true;
     }
     if (ShouldConvert2) {
       bool IsRValRef1 =
@@ -5821,6 +5820,7 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate(
       Obj2Ty = GetImplicitObjectParameterType(this->Context, Method2,
                                               RawObj2Ty, IsRValRef1);
       Args2.push_back(Obj2Ty);
+      Args2Offset = true;
     }
   } else {
     if (NonStaticMethod1 && Method1->hasCXXExplicitFunctionObjectParameter())
@@ -5842,10 +5842,10 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate(
   } else {
     assert(!Reversed && "Only call context could have reversed arguments");
   }
-  bool Better1 =
-      isAtLeastAsSpecializedAs(*this, Loc, FT1, FT2, TPOC, Args1, Args2);
-  bool Better2 =
-      isAtLeastAsSpecializedAs(*this, Loc, FT2, FT1, TPOC, Args2, Args1);
+  bool Better1 = isAtLeastAsSpecializedAs(*this, Loc, FT1, FT2, TPOC, Args1,
+                                          Args2, Args2Offset);
+  bool Better2 = isAtLeastAsSpecializedAs(*this, Loc, FT2, FT1, TPOC, Args2,
+                                          Args1, Args1Offset);
   // C++ [temp.deduct.partial]p10:
   //   F is more specialized than G if F is at least as specialized as G and G
   //   is not at least as specialized as F.
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 520dce870b7b7..e627fee51b66b 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -8844,7 +8844,11 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type,
     }
   case ParsedAttr::AT_HLSLResourceClass:
   case ParsedAttr::AT_HLSLROV: {
-    if (state.getSema().HLSL().handleResourceTypeAttr(attr))
+    // Only collect HLSL resource type attributes that are in
+    // decl-specifier-seq; do not collect attributes on declarations or those
+    // that get to slide after the declaration name.
+    if (TAL == TAL_DeclSpec &&
+        state.getSema().HLSL().handleResourceTypeAttr(attr))
       attr.setUsedAsTypeAttr();
     break;
   }
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
index 9df108e28ecdb..ecba5f9aa23ee 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
@@ -67,6 +67,32 @@ class DerefFuncDeleteExprVisitor
     const Decl *D = CE->getCalleeDecl();
     if (D && D->hasBody())
       return VisitBody(D->getBody());
+    else {
+      auto name = safeGetName(D);
+      if (name == "ensureOnMainThread" || name == "ensureOnMainRunLoop") {
+        for (unsigned i = 0; i < CE->getNumArgs(); ++i) {
+          auto *Arg = CE->getArg(i);
+          if (VisitLambdaArgument(Arg))
+            return true;
+        }
+      }
+    }
     return false;
   }
+
+  bool VisitLambdaArgument(const Expr *E) {
+    E = E->IgnoreParenCasts();
+    if (auto *TempE = dyn_cast<MaterializeTemporaryExpr>(E))
+      E = TempE->getSubExpr();
+    if (auto *ConstructE = dyn_cast<CXXConstructExpr>(E)) {
+      for (unsigned i = 0; i < ConstructE->getNumArgs(); ++i) {
+        auto *Arg = ConstructE->getArg(i);
+        if (auto *Lambda = dyn_cast<LambdaExpr>(Arg)) {
+          if (VisitBody(Lambda->getBody()))
+            return true;
+        }
+      }
+    }
+    return false;
+  }
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 9c9ca23e0a6a6..9fd5eae67a21f 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -968,3 +968,10 @@ namespace FunctionStart {
   static_assert(__builtin_function_start(a) == a, ""); // both-error {{not an integral constant expression}} \
                                                        // both-note {{comparison of addresses of literals has unspecified value}}
 }
+
+namespace BuiltinInImplicitCtor {
+  constexpr struct {
+    int a = __builtin_isnan(1.0);
+  } Foo;
+  static_assert(Foo.a == 0, "");
+}
diff --git a/clang/test/AST/ByteCode/initializer_list.cpp b/clang/test/AST/ByteCode/initializer_list.cpp
index 4e3b8dc912016..f882e4ff1b124 100644
--- a/clang/test/AST/ByteCode/initializer_list.cpp
+++ b/clang/test/AST/ByteCode/initializer_list.cpp
@@ -1,8 +1,6 @@
 // RUN: %clang_cc1 -fexperimental-new-constant-interpreter -fms-extensions -std=c++20 -verify=expected,both %s
 // RUN: %clang_cc1 -std=c++20 -fms-extensions -verify=ref,both %s
 
-// both-no-diagnostics
-
 namespace std {
   typedef decltype(sizeof(int)) size_t;
   template <class _E>
@@ -53,3 +51,21 @@ constexpr int foo() {
 }
 
 static_assert(foo() == 0);
+
+
+namespace rdar13395022 {
+  struct MoveOnly { // both-note {{candidate}}
+    MoveOnly(MoveOnly&&); // both-note 2{{copy constructor is implicitly deleted because}} both-note {{candidate}}
+  };
+
+  void test(MoveOnly mo) {
+    auto &&list1 = {mo}; // both-error {{call to implicitly-deleted copy constructor}} both-note {{in initialization of temporary of type 'std::initializer_list<MoveOnly>'}}
+    MoveOnly (&&list2)[1] = {mo}; // both-error {{call to implicitly-deleted copy constructor}} both-note {{in initialization of temporary of type 'MoveOnly[1]'}}
+    std::initializer_list<MoveOnly> &&list3 = {};
+    MoveOnly (&&list4)[1] = {}; // both-error {{no matching constructor}}
+    // both-note@-1 {{in implicit initialization of array element 0 with omitted initializer}}
+    // both-note@-2 {{in initialization of temporary of type 'MoveOnly[1]' created to list-initialize this reference}}
+  }
+}
+
+
diff --git a/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp 
b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp new file mode 100644 index 0000000000000..01527addb5299 --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp @@ -0,0 +1,232 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.RefCntblBaseVirtualDtor -verify %s + +#include "mock-types.h" + +namespace Detail { + +template +class CallableWrapperBase { +public: + virtual ~CallableWrapperBase() { } + virtual Out call(In...) = 0; +}; + +template class CallableWrapper; + +template +class CallableWrapper : public CallableWrapperBase { +public: + explicit CallableWrapper(CallableType&& callable) + : m_callable(WTFMove(callable)) { } + CallableWrapper(const CallableWrapper&) = delete; + CallableWrapper& operator=(const CallableWrapper&) = delete; + Out call(In... in) final; +private: + CallableType m_callable; +}; + +} // namespace Detail + +template class Function; + +template Function adopt(Detail::CallableWrapperBase*); + +template +class Function { +public: + using Impl = Detail::CallableWrapperBase; + + Function() = default; + + template + Function(FunctionType f); + + Out operator()(In... in) const; + explicit operator bool() const { return !!m_callableWrapper; } + +private: + enum AdoptTag { Adopt }; + Function(Impl* impl, AdoptTag) + : m_callableWrapper(impl) + { + } + + friend Function adopt(Impl*); + + Impl* m_callableWrapper; +}; + +template Function adopt(Detail::CallableWrapperBase* impl) +{ + return Function(impl, Function::Adopt); +} + +template, typename RefDerefTraits = DefaultRefDerefTraits> Ref adoptRef(T&); + +template +inline Ref adoptRef(T& reference) +{ + return Ref(reference); +} + +enum class DestructionThread : unsigned char { Any, Main, MainRunLoop }; +void ensureOnMainThread(Function&&); // Sync if called on main thread, async otherwise. +void ensureOnMainRunLoop(Function&&); // Sync if called on main run loop, async otherwise. 
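+// The scenario modeled below: deref() may defer `delete this` to the main
+// thread by handing a lambda to ensureOnMainThread() or ensureOnMainRunLoop(),
+// so the checker has to look through those lambda arguments before concluding
+// that a derived class's destructor is never invoked.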
+ +class ThreadSafeRefCountedBase { +public: + ThreadSafeRefCountedBase() = default; + + void ref() const + { + ++m_refCount; + } + + bool hasOneRef() const + { + return refCount() == 1; + } + + unsigned refCount() const + { + return m_refCount; + } + +protected: + bool derefBase() const + { + if (!--m_refCount) { + m_refCount = 1; + return true; + } + return false; + } + +private: + mutable unsigned m_refCount { 1 }; +}; + +template class ThreadSafeRefCounted : public ThreadSafeRefCountedBase { +public: + void deref() const + { + if (!derefBase()) + return; + + if constexpr (destructionThread == DestructionThread::Any) { + delete static_cast(this); + } else if constexpr (destructionThread == DestructionThread::Main) { + ensureOnMainThread([this] { + delete static_cast(this); + }); + } + } + +protected: + ThreadSafeRefCounted() = default; +}; + +class FancyRefCountedClass final : public ThreadSafeRefCounted { +public: + static Ref create() + { + return adoptRef(*new FancyRefCountedClass()); + } + + virtual ~FancyRefCountedClass(); + +private: + FancyRefCountedClass(); +}; + +template class BadThreadSafeRefCounted : public ThreadSafeRefCountedBase { +public: + void deref() const + { + if (!derefBase()) + return; + + [this] { + delete static_cast(this); + }; + } + +protected: + BadThreadSafeRefCounted() = default; +}; + +class FancyRefCountedClass2 final : public ThreadSafeRefCounted { +// expected-warning@-1{{Class 'ThreadSafeRefCounted' is used as a base of class 'FancyRefCountedClass2' but doesn't have virtual destructor}} +public: + static Ref create() + { + return adoptRef(*new FancyRefCountedClass2()); + } + + virtual ~FancyRefCountedClass2(); + +private: + FancyRefCountedClass2(); +}; + +template class NestedThreadSafeRefCounted : public ThreadSafeRefCountedBase { +public: + void deref() const + { + if (!derefBase()) + return; + ensureOnMainRunLoop([&] { + auto destroyThis = [&] { + delete static_cast(this); + }; + destroyThis(); + }); + } + +protected: + NestedThreadSafeRefCounted() = default; +}; + +class FancyRefCountedClass3 final : public NestedThreadSafeRefCounted { +public: + static Ref create() + { + return adoptRef(*new FancyRefCountedClass3()); + } + + virtual ~FancyRefCountedClass3(); + +private: + FancyRefCountedClass3(); +}; + +template class BadNestedThreadSafeRefCounted : public ThreadSafeRefCountedBase { +public: + void deref() const + { + if (!derefBase()) + return; + ensureOnMainThread([&] { + auto destroyThis = [&] { + delete static_cast(this); + }; + }); + } + +protected: + BadNestedThreadSafeRefCounted() = default; +}; + +class FancyRefCountedClass4 final : public BadNestedThreadSafeRefCounted { +// expected-warning@-1{{Class 'BadNestedThreadSafeRefCounted' is used as a base of class 'FancyRefCountedClass4' but doesn't have virtual destructor}} +public: + static Ref create() + { + return adoptRef(*new FancyRefCountedClass4()); + } + + virtual ~FancyRefCountedClass4(); + +private: + FancyRefCountedClass4(); +}; diff --git a/clang/test/CodeGenObjC/dllstorage.m b/clang/test/CodeGenObjC/dllstorage.m index c94f4c9b5804d..a6c591b2d7930 100644 --- a/clang/test/CodeGenObjC/dllstorage.m +++ b/clang/test/CodeGenObjC/dllstorage.m @@ -112,7 +112,7 @@ @interface M : I { // CHECK-IR-DAG: @"OBJC_IVAR_$_M._ivar" = external dllimport global i32 // CHECK-NF-DAG: @"$_OBJC_REF_CLASS_M" = external dllimport global ptr -// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external global i32 +// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external dllimport global i32 
__declspec(dllexport) __attribute__((__objc_exception__)) @@ -151,7 +151,7 @@ id f(Q *q) { // CHECK-IR-DAG: @"OBJC_IVAR_$_M._ivar" = external dllimport global i32 -// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external global i32 +// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external dllimport global i32 int g(void) { @autoreleasepool { diff --git a/clang/test/Driver/mcmodel.c b/clang/test/Driver/mcmodel.c index 9681c32579d71..c6c8b5433d23b 100644 --- a/clang/test/Driver/mcmodel.c +++ b/clang/test/Driver/mcmodel.c @@ -43,5 +43,4 @@ // AARCH64-PIC-LARGE: error: invalid argument '-mcmodel=large' only allowed with '-fno-pic' // ERR-AARCH64_32: error: unsupported argument 'small' to option '-mcmodel=' for target 'aarch64_32-unknown-linux' -// ERR-LOONGARCH64-PLT-LARGE: error: invalid argument '-mcmodel=large' not allowed with '-fplt' // ERR-LOONGARCH64-PLT-EXTREME: error: invalid argument '-mcmodel=extreme' not allowed with '-fplt' diff --git a/clang/test/Driver/riscv-mcmodel.c b/clang/test/Driver/riscv-mcmodel.c new file mode 100644 index 0000000000000..4f5fa95f59b66 --- /dev/null +++ b/clang/test/Driver/riscv-mcmodel.c @@ -0,0 +1,14 @@ +// RUN: %clang --target=riscv32 -### -c -mcmodel=small %s 2>&1 | FileCheck --check-prefix=SMALL %s +// RUN: %clang --target=riscv64 -### -c -mcmodel=small %s 2>&1 | FileCheck --check-prefix=SMALL %s + +// RUN: %clang --target=riscv32 -### -c -mcmodel=medlow %s 2>&1 | FileCheck --check-prefix=SMALL %s +// RUN: %clang --target=riscv64 -### -c -mcmodel=medlow %s 2>&1 | FileCheck --check-prefix=SMALL %s + +// RUN: %clang --target=riscv32 -### -c -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s +// RUN: %clang --target=riscv64 -### -c -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s + +// RUN: %clang --target=riscv32 -### -c -mcmodel=medany %s 2>&1 | FileCheck --check-prefix=MEDIUM %s +// RUN: %clang --target=riscv64 -### -c -mcmodel=medany %s 2>&1 | FileCheck --check-prefix=MEDIUM %s + +// SMALL: "-mcmodel=small" +// MEDIUM: "-mcmodel=medium" diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl index 24c85c6ccf7d7..cf21ec4d380db 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov]]':'__hlsl_resource_t' struct MyBuffer { __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h; }; -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t {{\[\[}}hlsl::resource_class(SRV)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t {{\[\[}}hlsl::resource_class(SRV)]] {{\[\[}}hlsl::is_rov]]':'__hlsl_resource_t' __hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res; // CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:14:6 f 'void () -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t {{\[\[}}hlsl::resource_class(Sampler)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t {{\[\[}}hlsl::resource_class(Sampler)]] 
{{\[\[}}hlsl::is_rov]]':'__hlsl_resource_t' void f() { __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r; } diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl index 68b2d9ecb190a..15685bd1a3baa 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl @@ -1,10 +1,10 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify // expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}} -[[hlsl::is_rov()]] __hlsl_resource_t res0; +[[hlsl::is_rov]] __hlsl_resource_t res0; // expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}} -__hlsl_resource_t [[hlsl::is_rov()]] res1; +__hlsl_resource_t [[hlsl::is_rov]] res1; // expected-error@+1{{'is_rov' attribute takes no arguments}} __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; @@ -12,5 +12,5 @@ __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; // expected-error@+1{{use of undeclared identifier 'gibberish'}} __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3; -// duplicate attribute with the same meaning - no error -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov()]] [[hlsl::is_rov()]] res4; +// expected-warning@+1{{attribute 'is_rov' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] [[hlsl::is_rov]] res4; diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index 6324a11fc8a2d..7c3830a291970 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -11,6 +11,6 @@ RWBuffer Buffer1; // CHECK: -TemplateArgument type 'vector' // CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector' 4 // CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'vector {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov()]]':'vector' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'vector {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov]]':'vector' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer RasterizerOrderedBuffer > BufferArray3[4]; diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index 59357d0730a7d..234e06f069074 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -158,17 +158,30 @@ auto begin(C &c) -> decltype(c.begin()); template T *begin(T (&array)[N]); +using size_t = decltype(sizeof(0)); + +template +struct initializer_list { + const T* ptr; size_t sz; +}; template struct vector { typedef __gnu_cxx::basic_iterator iterator; iterator begin(); iterator end(); const T *data() const; + vector(); + vector(initializer_list __l); + + template + vector(InputIterator first, InputIterator __last); + T &at(int n); }; template struct basic_string_view { + basic_string_view(); basic_string_view(const T *); const T *begin() const; }; @@ -203,11 +216,21 @@ template struct optional { optional(); optional(const T&); + + template + optional(U&& t); + + template + optional(optional&& __t); + T &operator*() &; T &&operator*() &&; T &value() &; T &&value() &&; }; +template +optional<__decay(T)> make_optional(T&&); + template struct stack { @@ -553,3 +576,57 @@ void test() { 
std::string_view svjkk1 = ReturnStringView(StrCat("bar", "x")); // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}} } } // namespace GH100549 + +namespace GH100526 { +void test() { + std::vector v1({std::string()}); // expected-warning {{object backing the pointer will be destroyed at the end}} + std::vector v2({ + std::string(), // expected-warning {{object backing the pointer will be destroyed at the end}} + std::string_view() + }); + std::vector v3({ + std::string_view(), + std::string() // expected-warning {{object backing the pointer will be destroyed at the end}} + }); + + std::optional o1 = std::string(); // expected-warning {{object backing the pointer}} + + std::string s; + // This is a tricky use-after-free case, what it does: + // 1. make_optional creates a temporary "optional"" object + // 2. the temporary object owns the underlying string which is copied from s. + // 3. the t3 object holds the view to the underlying string of the temporary object. + std::optional o2 = std::make_optional(s); // expected-warning {{object backing the pointer}} + std::optional o3 = std::optional(s); // expected-warning {{object backing the pointer}} + std::optional o4 = std::optional(s); + + // FIXME: should work for assignment cases + v1 = {std::string()}; + o1 = std::string(); + + // no warning on copying pointers. + std::vector n1 = {std::string_view()}; + std::optional n2 = {std::string_view()}; + std::optional n3 = std::string_view(); + std::optional n4 = std::make_optional(std::string_view()); + const char* b = ""; + std::optional n5 = std::make_optional(b); + std::optional n6 = std::make_optional("test"); +} + +std::vector test2(int i) { + std::vector t; + if (i) + return t; // this is fine, no dangling + return std::vector(t.begin(), t.end()); +} + +std::optional test3(int i) { + std::string s; + std::string_view sv; + if (i) + return s; // expected-warning {{address of stack memory associated}} + return sv; // fine +} + +} // namespace GH100526 diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index b8a9db103782c..91ef7786f11bb 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -4147,6 +4147,24 @@ class Template {}; // Make sure we don't crash when instantiating a type static_assert(!__is_trivially_equality_comparable(Template>)); + +struct S operator==(S, S); + +template struct basic_string_view {}; + +struct basic_string { + operator basic_string_view() const; +}; + +template +const bool is_trivially_equality_comparable = __is_trivially_equality_comparable(T); + +template > +void find(); + +void func() { find(); } + + namespace hidden_friend { struct TriviallyEqualityComparable { diff --git a/clang/test/SemaObjC/ivar-access-tests.m b/clang/test/SemaObjC/ivar-access-tests.m index cd7e09d406ada..6060dea5ab0f0 100644 --- a/clang/test/SemaObjC/ivar-access-tests.m +++ b/clang/test/SemaObjC/ivar-access-tests.m @@ -2,6 +2,8 @@ @interface MySuperClass { + int unmarked; + @private int private; @@ -17,6 +19,7 @@ @implementation MySuperClass - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; access = s->private; access = s->protected; } @@ -30,9 +33,11 @@ @implementation MyClass - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; MyClass *m=0; + access = m->unmarked; access = m->private; // expected-error {{instance variable 
'private' is private}} access = m->protected; } @@ -46,9 +51,11 @@ @implementation Deeper - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; MyClass *m=0; + access = m->unmarked; access = m->private; // expected-error {{instance variable 'private' is private}} access = m->protected; } @@ -61,9 +68,11 @@ @implementation Unrelated - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; // expected-error {{instance variable 'unmarked' is protected}} access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; // expected-error {{instance variable 'protected' is protected}} MyClass *m=0; + access = m->unmarked; // expected-error {{instance variable 'unmarked' is protected}} access = m->private; // expected-error {{instance variable 'private' is private}} access = m->protected; // expected-error {{instance variable 'protected' is protected}} } @@ -73,6 +82,7 @@ int main (void) { MySuperClass *s = 0; int access; + access = s->unmarked; // expected-error {{instance variable 'unmarked' is protected}} access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; // expected-error {{instance variable 'protected' is protected}} return 0; diff --git a/clang/test/SemaOpenACC/compute-construct-ast.cpp b/clang/test/SemaOpenACC/compute-construct-ast.cpp index e632522f877b5..7a33aeb80570c 100644 --- a/clang/test/SemaOpenACC/compute-construct-ast.cpp +++ b/clang/test/SemaOpenACC/compute-construct-ast.cpp @@ -117,5 +117,26 @@ struct S { void use() { TemplFunc(); } -#endif +struct HasCtor { HasCtor(); operator int(); ~HasCtor();}; + +void useCtorType() { + // CHECK-LABEL: useCtorType + // CHECK-NEXT: CompoundStmt + +#pragma acc kernels num_workers(HasCtor{}) + // CHECK-NEXT: OpenACCComputeConstruct{{.*}} kernels + // CHECK-NEXT: num_workers clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'int' + // CHECK-NEXT: MemberExpr{{.*}}.operator int + // CHECK-NEXT: MaterializeTemporaryExpr{{.*}}'HasCtor' + // CHECK-NEXT: CXXBindTemporaryExpr{{.*}}'HasCtor' + // CHECK-NEXT: CXXTemporaryObjectExpr{{.*}}'HasCtor' + + while(true); + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt +} +#endif diff --git a/clang/test/SemaTemplate/GH18291.cpp b/clang/test/SemaTemplate/GH18291.cpp index ca1e69e4ca3f5..820564ffa6f1a 100644 --- a/clang/test/SemaTemplate/GH18291.cpp +++ b/clang/test/SemaTemplate/GH18291.cpp @@ -86,4 +86,29 @@ namespace func_pointer { template void pow(_Tp, complex::type>) = delete; void (*ptr)(const complex &, complex){pow}; } // namespace param -} // namespace t3 +} // namespace func_pointer + +namespace static_vs_nonstatic { + namespace implicit_obj_param { + struct A { + template + static void f(int a, Args... args) {} + template + void f(Args... args) = delete; + }; + void g(){ + A::f(0); + } + } // namespace implicit_obj_param + namespace explicit_obj_param { + struct A { + template + static void f(int, Args... args) {} + template + void f(this A *, Args... 
args) = delete; + }; + void g(){ + A::f(0); + } + } // namespace explicit_obj_param +} // namespace static_vs_nonstatic diff --git a/compiler-rt/lib/orc/dlfcn_wrapper.cpp b/compiler-rt/lib/orc/dlfcn_wrapper.cpp index fd9dce40d6738..bbbc79f607f27 100644 --- a/compiler-rt/lib/orc/dlfcn_wrapper.cpp +++ b/compiler-rt/lib/orc/dlfcn_wrapper.cpp @@ -20,6 +20,7 @@ using namespace orc_rt; extern "C" const char *__orc_rt_jit_dlerror(); extern "C" void *__orc_rt_jit_dlopen(const char *path, int mode); +extern "C" int __orc_rt_jit_dlupdate(void *dso_handle, int mode); extern "C" int __orc_rt_jit_dlclose(void *dso_handle); ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult @@ -41,6 +42,18 @@ __orc_rt_jit_dlopen_wrapper(const char *ArgData, size_t ArgSize) { .release(); } +#ifdef __APPLE__ +ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult +__orc_rt_jit_dlupdate_wrapper(const char *ArgData, size_t ArgSize) { + return WrapperFunction::handle( + ArgData, ArgSize, + [](ExecutorAddr &DSOHandle, int32_t mode) { + return __orc_rt_jit_dlupdate(DSOHandle.toPtr(), mode); + }) + .release(); +} +#endif + ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult __orc_rt_jit_dlclose_wrapper(const char *ArgData, size_t ArgSize) { return WrapperFunction::handle( diff --git a/compiler-rt/lib/orc/macho_platform.cpp b/compiler-rt/lib/orc/macho_platform.cpp index c092545b2a367..8cc3594b5d0cf 100644 --- a/compiler-rt/lib/orc/macho_platform.cpp +++ b/compiler-rt/lib/orc/macho_platform.cpp @@ -331,6 +331,7 @@ class MachOPlatformRuntimeState { const char *dlerror(); void *dlopen(std::string_view Name, int Mode); + int dlupdate(void *DSOHandle, int Mode); int dlclose(void *DSOHandle); void *dlsym(void *DSOHandle, const char *Symbol); @@ -380,6 +381,12 @@ class MachOPlatformRuntimeState { Error dlopenInitialize(std::unique_lock &JDStatesLock, JITDylibState &JDS, MachOJITDylibDepInfoMap &DepInfo); + Error dlupdateImpl(void *DSOHandle, int Mode); + Error dlupdateFull(std::unique_lock &JDStatesLock, + JITDylibState &JDS); + Error dlupdateInitialize(std::unique_lock &JDStatesLock, + JITDylibState &JDS); + Error dlcloseImpl(void *DSOHandle); Error dlcloseDeinitialize(std::unique_lock &JDStatesLock, JITDylibState &JDS); @@ -789,6 +796,20 @@ void *MachOPlatformRuntimeState::dlopen(std::string_view Path, int Mode) { } } +int MachOPlatformRuntimeState::dlupdate(void *DSOHandle, int Mode) { + ORC_RT_DEBUG({ + std::string S; + printdbg("MachOPlatform::dlupdate(%p) (%s)\n", DSOHandle, S.c_str()); + }); + std::lock_guard Lock(DyldAPIMutex); + if (auto Err = dlupdateImpl(DSOHandle, Mode)) { + // FIXME: Make dlerror thread safe. + DLFcnError = toString(std::move(Err)); + return -1; + } + return 0; +} + int MachOPlatformRuntimeState::dlclose(void *DSOHandle) { ORC_RT_DEBUG({ auto *JDS = getJITDylibStateByHeader(DSOHandle); @@ -1244,6 +1265,67 @@ Error MachOPlatformRuntimeState::dlopenInitialize( return Error::success(); } +Error MachOPlatformRuntimeState::dlupdateImpl(void *DSOHandle, int Mode) { + std::unique_lock Lock(JDStatesMutex); + + // Try to find JITDylib state by DSOHandle. 
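
The new `__orc_rt_jit_dlupdate` entry point follows the runtime's existing dlfcn-style conventions: zero on success, -1 on failure with the message retrievable through `__orc_rt_jit_dlerror()` (see `MachOPlatformRuntimeState::dlupdate` below). A hypothetical caller-side sketch; the `mode` value is an assumption here, mirroring `dlopen`:

// Not part of the patch: illustrates the error convention only. The extern
// declarations match the ones added above; runPendingInitializers is made up.
extern "C" const char *__orc_rt_jit_dlerror();
extern "C" int __orc_rt_jit_dlupdate(void *dso_handle, int mode);

// Re-runs initializers registered for an already-open JITDylib (Apple
// platforms only, per the #ifdef __APPLE__ guard on the wrapper above).
bool runPendingInitializers(void *DSOHandle) {
  if (__orc_rt_jit_dlupdate(DSOHandle, /*mode=*/0) != 0) { // assumed mode
    const char *Err = __orc_rt_jit_dlerror();
    (void)Err; // report/log as appropriate
    return false;
  }
  return true;
}
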
+ auto *JDS = getJITDylibStateByHeader(DSOHandle); + + if (!JDS) { + std::ostringstream ErrStream; + ErrStream << "No registered JITDylib for " << DSOHandle; + return make_error(ErrStream.str()); + } + + if (!JDS->referenced()) + return make_error("dlupdate failed, JITDylib must be open."); + + if (!JDS->Sealed) { + if (auto Err = dlupdateFull(Lock, *JDS)) + return Err; + } + + return Error::success(); +} + +Error MachOPlatformRuntimeState::dlupdateFull( + std::unique_lock &JDStatesLock, JITDylibState &JDS) { + // Call back to the JIT to push the initializers. + Expected DepInfo((MachOJITDylibDepInfoMap())); + // Unlock so that we can accept the initializer update. + JDStatesLock.unlock(); + if (auto Err = WrapperFunction( + SPSExecutorAddr)>:: + call(JITDispatch(&__orc_rt_macho_push_initializers_tag), DepInfo, + ExecutorAddr::fromPtr(JDS.Header))) + return Err; + JDStatesLock.lock(); + + if (!DepInfo) + return DepInfo.takeError(); + + if (auto Err = dlupdateInitialize(JDStatesLock, JDS)) + return Err; + + return Error::success(); +} + +Error MachOPlatformRuntimeState::dlupdateInitialize( + std::unique_lock &JDStatesLock, JITDylibState &JDS) { + ORC_RT_DEBUG({ + printdbg("MachOPlatformRuntimeState::dlupdateInitialize(\"%s\")\n", + JDS.Name.c_str()); + }); + + // Initialize this JITDylib. + if (auto Err = registerObjCRegistrationObjects(JDStatesLock, JDS)) + return Err; + if (auto Err = runModInits(JDStatesLock, JDS)) + return Err; + + return Error::success(); +} + Error MachOPlatformRuntimeState::dlcloseImpl(void *DSOHandle) { std::unique_lock Lock(JDStatesMutex); @@ -1517,6 +1599,10 @@ void *__orc_rt_macho_jit_dlopen(const char *path, int mode) { return MachOPlatformRuntimeState::get().dlopen(path, mode); } +int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode) { + return MachOPlatformRuntimeState::get().dlupdate(dso_handle, mode); +} + int __orc_rt_macho_jit_dlclose(void *dso_handle) { return MachOPlatformRuntimeState::get().dlclose(dso_handle); } diff --git a/compiler-rt/lib/orc/macho_platform.h b/compiler-rt/lib/orc/macho_platform.h index 62234039437c0..ad70c97809d2f 100644 --- a/compiler-rt/lib/orc/macho_platform.h +++ b/compiler-rt/lib/orc/macho_platform.h @@ -24,6 +24,7 @@ ORC_RT_INTERFACE void __orc_rt_macho_cxa_finalize(void *dso_handle); // dlfcn functions. ORC_RT_INTERFACE const char *__orc_rt_macho_jit_dlerror(); ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlopen(const char *path, int mode); +ORC_RT_INTERFACE int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode); ORC_RT_INTERFACE int __orc_rt_macho_jit_dlclose(void *dso_handle); ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlsym(void *dso_handle, const char *symbol); diff --git a/compiler-rt/lib/rtsan/rtsan_context.cpp b/compiler-rt/lib/rtsan/rtsan_context.cpp index a49b70360babb..8609394fa222f 100644 --- a/compiler-rt/lib/rtsan/rtsan_context.cpp +++ b/compiler-rt/lib/rtsan/rtsan_context.cpp @@ -95,10 +95,11 @@ void __rtsan::PrintDiagnostics(const char *intercepted_function_name, uptr pc, uptr bp) { ScopedErrorReportLock l; - fprintf(stderr, - "Real-time violation: intercepted call to real-time unsafe function " - "`%s` in real-time context! 
Stack trace:\n", - intercepted_function_name); + Report("ERROR: RealtimeSanitizer: unsafe-library-call\n"); + Printf("Intercepted call to real-time unsafe function " + "`%s` in real-time context!\n", + intercepted_function_name); + __rtsan::PrintStackTrace(pc, bp); } diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp index 0eeaf9da67098..1ef4c66a28de8 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp @@ -472,11 +472,12 @@ TEST_F(PthreadMutexLockTest, PthreadMutexUnlockSurvivesWhenNotRealtime) { ExpectNonRealtimeSurvival(Func); } -TEST(TestRtsanInterceptors, PthreadMutexJoinDiesWhenRealtime) { - auto Func = []() { - pthread_t thread{}; - pthread_join(thread, nullptr); - }; +TEST(TestRtsanInterceptors, PthreadJoinDiesWhenRealtime) { + pthread_t thread{}; + ASSERT_EQ(0, + pthread_create(&thread, nullptr, &FakeThreadEntryPoint, nullptr)); + + auto Func = [&thread]() { pthread_join(thread, nullptr); }; ExpectRealtimeDeath(Func, "pthread_join"); ExpectNonRealtimeSurvival(Func); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h b/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h index 6ca09cf657094..4ba4fc5e53086 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h @@ -30,9 +30,10 @@ void ExpectRealtimeDeath(Function &&Func, auto GetExpectedErrorSubstring = [&]() -> std::string { return intercepted_method_name != nullptr - ? "Real-time violation: intercepted call to real-time unsafe " - "function `" + - std::string(intercepted_method_name) + "`" + ? ".*==ERROR: RealtimeSanitizer: unsafe-library-call.*" + "Intercepted call to real-time unsafe function `" + + std::string(intercepted_method_name) + + "` in real-time context!" : ""; }; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index f8f03454ea169..9208b12552ff5 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -139,8 +139,14 @@ namespace __sanitizer { #if defined(__UINTPTR_TYPE__) +# if defined(__arm__) && defined(__linux__) +// Linux Arm headers redefine __UINTPTR_TYPE__ and disagree with clang/gcc. +typedef unsigned int uptr; +typedef int sptr; +# else typedef __UINTPTR_TYPE__ uptr; typedef __INTPTR_TYPE__ sptr; +# endif #elif defined(_WIN64) // 64-bit Windows uses LLP64 data model. typedef unsigned long long uptr; diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index 1a232b9b9fb2d..2fae29e5a2168 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -72,13 +72,16 @@ namespace { struct CachedBlock { static constexpr u16 CacheIndexMax = UINT16_MAX; static constexpr u16 InvalidEntry = CacheIndexMax; - // * MaxReleasedCachePages default is currently 4 - // - We arrived at this value after noticing that mapping - // in larger memory regions performs better than releasing - // memory and forcing a cache hit. According to the data, - // it suggests that beyond 4 pages, the release execution time is - // longer than the map execution time. In this way, the default - // is dependent on the platform. + // We allow a certain amount of fragmentation and part of the fragmented bytes + // will be released by `releaseAndZeroPagesToOS()`. 
This increases the chance
+  // of a cache hit and reduces the overhead to the RSS at the same time. See
+  // more details in the `MapAllocatorCache::retrieve()` section.
+  //
+  // We arrived at this default value after noticing that mapping in larger
+  // memory regions performs better than releasing memory and forcing a cache
+  // hit. According to the data, it suggests that beyond 4 pages, the release
+  // execution time is longer than the map execution time. In this way,
+  // the default is dependent on the platform.
   static constexpr uptr MaxReleasedCachePages = 4U;

   uptr CommitBase = 0;
@@ -725,8 +728,14 @@ MapAllocator::tryAllocateFromCache(const Options &Options, uptr Size,
   uptr EntryHeaderPos;
   uptr MaxAllowedFragmentedPages = MaxUnreleasedCachePages;

-  if (UNLIKELY(useMemoryTagging(Options)))
+  if (LIKELY(!useMemoryTagging(Options))) {
     MaxAllowedFragmentedPages += CachedBlock::MaxReleasedCachePages;
+  } else {
+    // TODO: Enabling MaxReleasedCachePages may result in pages for an entry
+    // being partially released, which erases the tags of those pages as well.
+    // To support this feature for MTE, we need to tag those pages again.
+    DCHECK_EQ(MaxAllowedFragmentedPages, MaxUnreleasedCachePages);
+  }

   Entry = Cache.retrieve(MaxAllowedFragmentedPages, Size, Alignment,
                          getHeadersSize(), EntryHeaderPos);
diff --git a/compiler-rt/test/rtsan/basic.cpp b/compiler-rt/test/rtsan/basic.cpp
index f4075bb27e4f9..607db90213a30 100644
--- a/compiler-rt/test/rtsan/basic.cpp
+++ b/compiler-rt/test/rtsan/basic.cpp
@@ -17,6 +17,7 @@ void violation() [[clang::nonblocking]] {
 int main() {
   violation();
   return 0;
-  // CHECK: Real-time violation: intercepted call to real-time unsafe function `malloc` in real-time context! Stack trace:
+  // CHECK: ==ERROR: RealtimeSanitizer: unsafe-library-call
+  // CHECK-NEXT: Intercepted call to real-time unsafe function `malloc` in real-time context!
   // CHECK-NEXT: {{.*malloc*}}
 }
diff --git a/compiler-rt/test/rtsan/disabler.cpp b/compiler-rt/test/rtsan/disabler.cpp
index 0a6411a2be694..dd1d4439beae4 100644
--- a/compiler-rt/test/rtsan/disabler.cpp
+++ b/compiler-rt/test/rtsan/disabler.cpp
@@ -41,7 +41,7 @@ int main() {
   // CHECK: Allocated pointer {{.*}} in disabled context
   // CHECK: Allocated second pointer {{.*}} in disabled context
   // CHECK: Free'd second pointer in disabled context
-  // CHECK: {{.*Real-time violation.*}}
+  // CHECK: ==ERROR: RealtimeSanitizer: unsafe-library-call
   // CHECK-NOT: {{.*malloc*}}
   // CHECK-NEXT: {{.*free.*}}
 }
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index fb57744c21570..ed1ef49f8b77a 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -386,6 +386,9 @@ end
   probably by a C or C++ external definition.
+* An automatic data object may be declared in the specification part of the main program.
+* A local data object may appear in a specification expression, even
+  when it is not a dummy argument or in COMMON, so long as it
+  has the SAVE attribute and was initialized.

 ### Extensions supported when enabled by options
@@ -507,10 +510,7 @@ end
   f18 supports them with a portability warning.
 * f18 does not enforce a blanket prohibition against generic interfaces
   containing a mixture of functions and subroutines.
-  Apart from some contexts in which the standard requires all of
-  a particular generic interface to have only all functions or
-  all subroutines as its specific procedures, we allow both to
-  appear, unlike several other Fortran compilers.
+ We allow both to appear, unlike several other Fortran compilers. This is especially desirable when two generics of the same name are combined due to USE association and the mixture may be inadvertent. diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index 0c8a3d2bd5281..86c6e02b0f2ff 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -51,7 +51,8 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize, NonBindCInteroperability, CudaManaged, CudaUnified, PolymorphicActualAllocatableOrPointerToMonomorphicDummy, RelaxedPureDummy, - UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr) + UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr, + SavedLocalInSpecExpr) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, @@ -146,6 +147,8 @@ class LanguageFeatureControl { warnUsage_.set(UsageWarning::VectorSubscriptFinalization); warnUsage_.set(UsageWarning::UndefinedFunctionResult); warnUsage_.set(UsageWarning::UselessIomsg); + // New warnings, on by default + warnLanguage_.set(LanguageFeature::SavedLocalInSpecExpr); } LanguageFeatureControl(const LanguageFeatureControl &) = default; diff --git a/flang/include/flang/Evaluate/expression.h b/flang/include/flang/Evaluate/expression.h index 3ba46edba717b..2a40193e32306 100644 --- a/flang/include/flang/Evaluate/expression.h +++ b/flang/include/flang/Evaluate/expression.h @@ -342,6 +342,7 @@ template struct Extremum : public Operation, A, A, A> { : Base{x, y}, ordering{ord} {} Extremum(Ordering ord, Expr &&x, Expr &&y) : Base{std::move(x), std::move(y)}, ordering{ord} {} + bool operator==(const Extremum &) const; Ordering ordering{Ordering::Greater}; }; @@ -381,6 +382,7 @@ struct LogicalOperation : Base{x, y}, logicalOperator{opr} {} LogicalOperation(LogicalOperator opr, Expr &&x, Expr &&y) : Base{std::move(x), std::move(y)}, logicalOperator{opr} {} + bool operator==(const LogicalOperation &) const; LogicalOperator logicalOperator; }; @@ -634,6 +636,7 @@ class Relational : public Operation, LogicalResult, T, T> { : Base{a, b}, opr{r} {} Relational(RelationalOperator r, Expr &&a, Expr &&b) : Base{std::move(a), std::move(b)}, opr{r} {} + bool operator==(const Relational &) const; RelationalOperator opr; }; diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 3675d9f924876..a0487e399d936 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -218,6 +218,22 @@ template A *UnwrapExpr(std::optional &x) { } } +template const A *UnwrapExpr(const B *x) { + if (x) { + return UnwrapExpr(*x); + } else { + return nullptr; + } +} + +template A *UnwrapExpr(B *x) { + if (x) { + return UnwrapExpr(*x); + } else { + return nullptr; + } +} + // A variant of UnwrapExpr above that also skips through (parentheses) // and conversions of kinds within a category. Useful for extracting LEN // type parameter inquiries, at least. 
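
The `UnwrapExpr` pointer overloads added above (their template headers do not survive in this copy of the patch) simply tolerate null and delegate to the existing reference overloads. A toy sketch of the shape, with stand-in types rather than `Fortran::evaluate::Expr`:

#include <variant>

// Stand-in expression node; the real overloads unwrap evaluate::Expr.
template <typename A> struct Expr { std::variant<A, char> u; };

// Pre-existing reference overload (illustrative).
template <typename A> A *UnwrapExpr(Expr<A> &x) { return std::get_if<A>(&x.u); }

// New pointer overload: null in, null out; otherwise defer to the reference
// overload so the unwrapping logic stays in one place.
template <typename A> A *UnwrapExpr(Expr<A> *x) {
  return x ? UnwrapExpr(*x) : nullptr;
}

The const variants follow the same delegation pattern.
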
diff --git a/flang/include/flang/Semantics/scope.h b/flang/include/flang/Semantics/scope.h index a58163f5460c2..e73a507e9b3f5 100644 --- a/flang/include/flang/Semantics/scope.h +++ b/flang/include/flang/Semantics/scope.h @@ -138,6 +138,8 @@ class Scope { const_iterator cend() const { return symbols_.cend(); } // Return symbols in declaration order (the iterators above are in name order) + // When a generic procedure interface shadows a derived type or specific + // procedure, only the generic's symbol appears in the output. SymbolVector GetSymbols() const; MutableSymbolVector GetSymbols(); diff --git a/flang/include/flang/Semantics/type.h b/flang/include/flang/Semantics/type.h index 04f8b11e992a0..e2d47d38f927f 100644 --- a/flang/include/flang/Semantics/type.h +++ b/flang/include/flang/Semantics/type.h @@ -459,8 +459,5 @@ inline const DerivedTypeSpec *DeclTypeSpec::AsDerived() const { return const_cast(this)->AsDerived(); } -std::optional IsInteroperableIntrinsicType( - const DeclTypeSpec &, const common::LanguageFeatureControl &); - } // namespace Fortran::semantics #endif // FORTRAN_SEMANTICS_TYPE_H_ diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index fef4620857a08..a1ede7d7553bf 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -525,6 +525,11 @@ class CheckSpecificationExprHelper Result operator()(const semantics::Symbol &symbol) const { const auto &ultimate{symbol.GetUltimate()}; + const auto *object{ultimate.detailsIf()}; + bool isInitialized{semantics::IsSaved(ultimate) && + !IsAllocatable(ultimate) && object && + (ultimate.test(Symbol::Flag::InDataStmt) || + object->init().has_value())}; if (const auto *assoc{ ultimate.detailsIf()}) { return (*this)(assoc->expr()); @@ -554,6 +559,17 @@ class CheckSpecificationExprHelper } } else if (&symbol.owner() != &scope_ || &ultimate.owner() != &scope_) { return std::nullopt; // host association is in play + } else if (isInitialized && + context_.languageFeatures().IsEnabled( + common::LanguageFeature::SavedLocalInSpecExpr)) { + if (!scope_.IsModuleFile() && + context_.languageFeatures().ShouldWarn( + common::LanguageFeature::SavedLocalInSpecExpr)) { + context_.messages().Say( + "specification expression refers to local object '%s' (initialized and saved)"_port_en_US, + ultimate.name().ToString()); + } + return std::nullopt; } else if (const auto *object{ ultimate.detailsIf()}) { if (object->commonBlock()) { @@ -781,8 +797,9 @@ bool CheckSpecificationExprHelper::IsPermissibleInquiry( template void CheckSpecificationExpr(const A &x, const semantics::Scope &scope, FoldingContext &context, bool forElementalFunctionResult) { - if (auto why{CheckSpecificationExprHelper{ - scope, context, forElementalFunctionResult}(x)}) { + CheckSpecificationExprHelper helper{ + scope, context, forElementalFunctionResult}; + if (auto why{helper(x)}) { context.messages().Say("Invalid specification expression%s: %s"_err_en_US, forElementalFunctionResult ? 
" for elemental function result" : "", *why); diff --git a/flang/lib/Evaluate/expression.cpp b/flang/lib/Evaluate/expression.cpp index 5b0bc14dc3e1b..1a65d4c7362fe 100644 --- a/flang/lib/Evaluate/expression.cpp +++ b/flang/lib/Evaluate/expression.cpp @@ -125,6 +125,24 @@ template LLVM_DUMP_METHOD void ExpressionBase::dump() const { // Equality testing +template bool Extremum::operator==(const Extremum &that) const { + return ordering == that.ordering && Base::operator==(that); +} + +template +bool LogicalOperation::operator==(const LogicalOperation &that) const { + return logicalOperator == that.logicalOperator && Base::operator==(that); +} + +template +bool Relational::operator==(const Relational &that) const { + return opr == that.opr && Base::operator==(that); +} + +bool Relational::operator==(const Relational &that) const { + return u == that.u; +} + bool ImpliedDoIndex::operator==(const ImpliedDoIndex &that) const { return name == that.name; } @@ -181,10 +199,6 @@ bool StructureConstructor::operator==(const StructureConstructor &that) const { return result_ == that.result_ && values_ == that.values_; } -bool Relational::operator==(const Relational &that) const { - return u == that.u; -} - template bool Expr>::operator==( const Expr> &that) const { diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h index 9ce0edbdcb779..1b14a305b87f4 100644 --- a/flang/lib/Evaluate/fold-implementation.h +++ b/flang/lib/Evaluate/fold-implementation.h @@ -1088,24 +1088,42 @@ Expr FoldMINorMAX( static_assert(T::category == TypeCategory::Integer || T::category == TypeCategory::Real || T::category == TypeCategory::Character); - std::vector *> constantArgs; - // Call Folding on all arguments, even if some are not constant, - // to make operand promotion explicit. - for (auto &arg : funcRef.arguments()) { - if (auto *cst{Folder{context}.Folding(arg)}) { - constantArgs.push_back(cst); + auto &args{funcRef.arguments()}; + bool ok{true}; + std::optional> result; + Folder folder{context}; + for (std::optional &arg : args) { + // Call Folding on all arguments to make operand promotion explicit. + if (!folder.Folding(arg)) { + // TODO: Lowering can't handle having every FunctionRef for max and min + // being converted into Extremum. That needs fixing. Until that + // is corrected, however, it is important that max and min references + // in module files be converted into Extremum even when not constant; + // the Extremum operations created to normalize the + // values of array bounds are formatted as max operations in the + // declarations in modules, and need to be read back in as such in + // order for expression comparison to not produce false inequalities + // when checking function results for procedure interface compatibility. + if (!context.moduleFileName()) { + ok = false; + } + } + Expr *argExpr{arg ? 
arg->UnwrapExpr() : nullptr}; + if (argExpr) { + *argExpr = Fold(context, std::move(*argExpr)); + } + if (Expr * tExpr{UnwrapExpr>(argExpr)}) { + if (result) { + result = FoldOperation( + context, Extremum{order, std::move(*result), Expr{*tExpr}}); + } else { + result = Expr{*tExpr}; + } + } else { + ok = false; } } - if (constantArgs.size() != funcRef.arguments().size()) { - return Expr(std::move(funcRef)); - } - CHECK(!constantArgs.empty()); - Expr result{std::move(*constantArgs[0])}; - for (std::size_t i{1}; i < constantArgs.size(); ++i) { - Extremum extremum{order, result, Expr{std::move(*constantArgs[i])}}; - result = FoldOperation(context, std::move(extremum)); - } - return result; + return ok && result ? std::move(*result) : Expr{std::move(funcRef)}; } // For AMAX0, AMIN0, AMAX1, AMIN1, DMAX1, DMIN1, MAX0, MIN0, MAX1, and MIN1 diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index ebe946ac60ccb..876c2aed4ffd6 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -587,7 +587,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"izext", {{"i", AnyInt}}, TypePattern{IntType, KindCode::exactKind, 2}}, {"jzext", {{"i", AnyInt}}, DefaultInt}, {"kind", - {{"x", AnyIntrinsic, Rank::elemental, Optionality::required, + {{"x", AnyIntrinsic, Rank::anyOrAssumedRank, Optionality::required, common::Intent::In, {ArgFlag::canBeMoldNull}}}, DefaultInt, Rank::elemental, IntrinsicClass::inquiryFunction}, {"lbound", diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 6b3db619c1e2f..400f27aef98da 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1011,6 +1011,9 @@ struct CollectCudaSymbolsHelper : public SetTraverse IsInteroperableIntrinsicType(const DynamicType &type, return true; case TypeCategory::Real: case TypeCategory::Complex: - return (features && features->IsEnabled(common::LanguageFeature::CUDA)) || - type.kind() >= 4; // no short or half floats + return type.kind() >= 4 /* not a short or half float */ || !features || + features->IsEnabled(common::LanguageFeature::CUDA); case TypeCategory::Logical: return type.kind() == 1; // C_BOOL case TypeCategory::Character: diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 3f54234b176e3..f336d213cc862 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -181,7 +181,7 @@ static void addUseDeviceClause( static void convertLoopBounds(lower::AbstractConverter &converter, mlir::Location loc, - mlir::omp::LoopRelatedOps &result, + mlir::omp::LoopRelatedClauseOps &result, std::size_t loopVarTypeSize) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); // The types of lower bound, upper bound, and step are converted into the @@ -203,7 +203,7 @@ static void convertLoopBounds(lower::AbstractConverter &converter, bool ClauseProcessor::processCollapse( mlir::Location currentLocation, lower::pft::Evaluation &eval, - mlir::omp::LoopRelatedOps &result, + mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv) const { bool found = false; fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); @@ -855,7 +855,7 @@ bool ClauseProcessor::processIf( // Assume that, at most, a single 'if' clause will be applicable to the // given directive. 
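
The rewritten `FoldMINorMAX` above replaces the old all-or-nothing constant fold: it folds each operand as it goes and accumulates a left-nested chain of binary `Extremum` operations, keeping the chain even for non-constant operands when processing a module file so that re-read array-bound expressions compare equal. A toy model of the accumulation shape (stand-in types, not the evaluate framework):

#include <optional>
#include <vector>

// MAX(a, b, c) becomes Extremum(Extremum(a, b), c), built one operand
// at a time.
struct Expr { int value; };

static Expr Extremum(bool wantGreater, Expr x, Expr y) {
  return (x.value > y.value) == wantGreater ? x : y;
}

static std::optional<Expr> FoldMax(const std::vector<Expr> &args) {
  std::optional<Expr> result;
  for (const Expr &arg : args)
    result = result ? Extremum(/*wantGreater=*/true, *result, arg)
                    : arg; // the first operand seeds the chain
  return result;           // nullopt when there are no operands
}
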
if (operand) {
-      result.ifVar = operand;
+      result.ifExpr = operand;
       found = true;
     }
   });
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index f6b319c726a2d..8d02d368f4ee0 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -55,7 +55,7 @@ class ClauseProcessor {
   // 'Unique' clauses: They can appear at most once in the clause list.
   bool processCollapse(mlir::Location currentLocation,
                        lower::pft::Evaluation &eval,
-                       mlir::omp::LoopRelatedOps &result,
+                       mlir::omp::LoopRelatedClauseOps &result,
                        llvm::SmallVectorImpl &iv) const;
   bool processDevice(lower::StatementContext &stmtCtx,
                      mlir::omp::DeviceClauseOps &result) const;
diff --git a/flang/lib/Lower/PFTBuilder.cpp b/flang/lib/Lower/PFTBuilder.cpp
index 5b3d5471925bf..793e291a168ad 100644
--- a/flang/lib/Lower/PFTBuilder.cpp
+++ b/flang/lib/Lower/PFTBuilder.cpp
@@ -1566,6 +1566,14 @@ struct SymbolDependenceAnalysis {
       return 0;
     LLVM_DEBUG(llvm::dbgs() << "analyze symbol " << &sym << " in <"
                             << &sym.owner() << ">: " << sym << '\n');
+    const semantics::Symbol &ultimate = sym.GetUltimate();
+    if (const auto *details = ultimate.detailsIf()) {
+      // Procedure pointers may be "hidden" behind the generic symbol if they
+      // have the same name.
+      if (const semantics::Symbol *specific = details->specific())
+        analyze(*specific);
+      return 0;
+    }
     const bool isProcedurePointerOrDummy =
         semantics::IsProcedurePointer(sym) ||
         (semantics::IsProcedure(sym) && IsDummy(sym));
@@ -1582,7 +1590,6 @@ struct SymbolDependenceAnalysis {
     if (sym.owner().IsDerivedType())
       return 0;

-    semantics::Symbol ultimate = sym.GetUltimate();
     if (const auto *details =
             ultimate.detailsIf()) {
       // handle namelist group symbols
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index c5a135a189e8d..d786d79ba8701 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -1594,8 +1594,7 @@ mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder,
       cPtrCoor = builder.create(loc, addrFieldTy, cPtr, arrayAttr);
     }
-    mlir::Value cptr = builder.create(loc, cPtrCoor);
-    return genCPtrOrCFunptrValue(builder, loc, cptr);
+    return genCPtrOrCFunptrValue(builder, loc, cPtrCoor);
   }

   if (fir::isa_ref_type(cPtr.getType())) {
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 576e65ba6ecc5..46e70d7ef9180 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -72,6 +72,10 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase {
                       mlir::LLVM::DICompileUnitAttr cuAttr,
                       fir::DebugTypeGenerator &typeGen,
                       mlir::SymbolTable *symbolTable);
+  std::optional
+  getModuleAttrFromGlobalOp(fir::GlobalOp globalOp,
+                            mlir::LLVM::DIFileAttr fileAttr,
+                            mlir::LLVM::DIScopeAttr scope);
 };

 bool debugInfoIsAlreadySet(mlir::Location loc) {
@@ -152,6 +156,45 @@ mlir::LLVM::DIModuleAttr AddDebugInfoPass::getOrCreateModuleAttr(
   return modAttr;
 }

+/// If globalOp represents a module variable, return a ModuleAttr that
+/// represents that module.
+std::optional +AddDebugInfoPass::getModuleAttrFromGlobalOp(fir::GlobalOp globalOp, + mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scope) { + mlir::MLIRContext *context = &getContext(); + mlir::OpBuilder builder(context); + + std::pair result = fir::NameUniquer::deconstruct(globalOp.getSymName()); + // Only look for module if this variable is not part of a function. + if (!result.second.procs.empty() || result.second.modules.empty()) + return std::nullopt; + + // DWARF5 says following about the fortran modules: + // A Fortran 90 module may also be represented by a module entry + // (but no declaration attribute is warranted because Fortran has no concept + // of a corresponding module body). + // But in practice, compilers use declaration attribute with a module in cases + // where module was defined in another source file (only being used in this + // one). The isInitialized() seems to provide the right information + // but inverted. It is true where module is actually defined but false where + // it is used. + // FIXME: Currently we don't have the line number on which a module was + // declared. We are using a best guess of line - 1 where line is the source + // line of the first member of the module that we encounter. + unsigned line = getLineFromLoc(globalOp.getLoc()); + + mlir::LLVM::DISubprogramAttr sp = + mlir::dyn_cast_if_present(scope); + // Modules are generated at compile unit scope + if (sp) + scope = sp.getCompileUnit(); + + return getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope, + std::max(line - 1, (unsigned)1), + !globalOp.isInitialized()); +} + void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, @@ -174,33 +217,11 @@ void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp, return; unsigned line = getLineFromLoc(globalOp.getLoc()); + std::optional modOpt = + getModuleAttrFromGlobalOp(globalOp, fileAttr, scope); + if (modOpt) + scope = *modOpt; - // DWARF5 says following about the fortran modules: - // A Fortran 90 module may also be represented by a module entry - // (but no declaration attribute is warranted because Fortran has no concept - // of a corresponding module body). - // But in practice, compilers use declaration attribute with a module in cases - // where module was defined in another source file (only being used in this - // one). The isInitialized() seems to provide the right information - // but inverted. It is true where module is actually defined but false where - // it is used. - // FIXME: Currently we don't have the line number on which a module was - // declared. We are using a best guess of line - 1 where line is the source - // line of the first member of the module that we encounter. - - if (result.second.procs.empty()) { - // Only look for module if this variable is not part of a function. 
-    if (result.second.modules.empty())
-      return;
-
-    // Modules are generated at compile unit scope
-    if (mlir::LLVM::DISubprogramAttr sp =
-            mlir::dyn_cast_if_present(scope))
-      scope = sp.getCompileUnit();
-
-    scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope,
-                                  line - 1, !globalOp.isInitialized());
-  }
   mlir::LLVM::DITypeAttr diType =
       typeGen.convertType(globalOp.getType(), fileAttr, scope, declOp);
   auto gvAttr = mlir::LLVM::DIGlobalVariableAttr::get(
@@ -262,7 +283,7 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp,
       mlir::LLVM::DIFileAttr::get(context, fileName, filePath);

   // Only definitions need a distinct identifier and a compilation unit.
-  mlir::DistinctAttr id;
+  mlir::DistinctAttr id, id2;
   mlir::LLVM::DIScopeAttr Scope = fileAttr;
   mlir::LLVM::DICompileUnitAttr compilationUnit;
   mlir::LLVM::DISubprogramFlags subprogramFlags =
@@ -270,7 +291,10 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp,
   if (isOptimized)
     subprogramFlags = mlir::LLVM::DISubprogramFlags::Optimized;
   if (!funcOp.isExternal()) {
+    // The placeholder and the final function have to have different IDs;
+    // otherwise the translation code will reject one of them.
     id = mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
+    id2 = mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
     compilationUnit = cuAttr;
     subprogramFlags =
         subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
@@ -299,14 +323,69 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp,
                                         line - 1, false);
   }

-  auto spAttr = mlir::LLVM::DISubprogramAttr::get(
-      context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
-      line, line, subprogramFlags, subTypeAttr, /*retainedNodes=*/{});
-  funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
-  // Don't process variables if user asked for line tables only.
-  if (debugLevel == mlir::LLVM::DIEmissionKind::LineTablesOnly)
+  if (debugLevel == mlir::LLVM::DIEmissionKind::LineTablesOnly) {
+    auto spAttr = mlir::LLVM::DISubprogramAttr::get(
+        context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
+        line, line, subprogramFlags, subTypeAttr, /*retainedNodes=*/{});
+    funcOp->setLoc(builder.getFusedLoc({l}, spAttr));
     return;
+  }
+
+  mlir::DistinctAttr recId =
+      mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
+
+  // Debug attributes in MLIR are read-only once created. But in the case of
+  // imported entities, we have a circular dependency: the
+  // DIImportedEntityAttr requires scope information (the DISubprogramAttr in
+  // this case) and the DISubprogramAttr requires the list of imported
+  // entities. MLIR provides a way in which a DISubprogramAttr can be created
+  // with a certain recID and used in places like DIImportedEntityAttr; after
+  // that, another DISubprogramAttr can be created with the same recID but
+  // with the list of entities now available. The MLIR translation code takes
+  // care of updating the references. Note that references will be updated
+  // only in attributes that are part of the DISubprogramAttr (like
+  // DIImportedEntityAttr), so we have to create the final DISubprogramAttr
+  // before we process local variables. Look at DIRecursiveTypeAttrInterface
+  // for more details.
+
+  auto spAttr = mlir::LLVM::DISubprogramAttr::get(
+      context, recId, /*isRecSelf=*/true, id, compilationUnit, Scope, funcName,
+      fullName, funcFileAttr, line, line, subprogramFlags, subTypeAttr,
+      /*retainedNodes=*/{});
+
+  // There is no direct information in the IR for any 'use' statement in the
+  // function.
We have to extract that information from the DeclareOp. We do + // a pass on the DeclareOp and generate ModuleAttr and corresponding + // DIImportedEntityAttr for that module. + // FIXME: As we are depending on the variables to see which module is being + // 'used' in the function, there are certain limitations. + // For things like 'use mod1, only: v1', whole module will be brought into the + // namespace in the debug info. It is not a problem as such unless there is a + // clash of names. + // There is no information about module variable renaming + llvm::DenseSet importedModules; + funcOp.walk([&](fir::cg::XDeclareOp declOp) { + if (&funcOp.front() == declOp->getBlock()) + if (auto global = + symbolTable->lookup(declOp.getUniqName())) { + std::optional modOpt = + getModuleAttrFromGlobalOp(global, fileAttr, cuAttr); + if (modOpt) { + auto importedEntity = mlir::LLVM::DIImportedEntityAttr::get( + context, llvm::dwarf::DW_TAG_imported_module, spAttr, *modOpt, + fileAttr, /*line=*/1, /*name=*/nullptr, /*elements*/ {}); + importedModules.insert(importedEntity); + } + } + }); + llvm::SmallVector entities(importedModules.begin(), + importedModules.end()); + // We have the imported entities now. Generate the final DISubprogramAttr. + spAttr = mlir::LLVM::DISubprogramAttr::get( + context, recId, /*isRecSelf=*/false, id2, compilationUnit, Scope, + funcName, fullName, funcFileAttr, line, line, subprogramFlags, + subTypeAttr, entities); + funcOp->setLoc(builder.getFusedLoc({l}, spAttr)); funcOp.walk([&](fir::cg::XDeclareOp declOp) { // FIXME: We currently dont handle variables that are not in the entry diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 734c34276b13b..c896ee7d29381 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -3003,17 +3003,17 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType( } else { msgs.Annex(std::move(bad)); } - } else if (!IsInteroperableIntrinsicType( - *type, context_.languageFeatures()) + } else if (auto dyType{evaluate::DynamicType::From(*type)}; dyType && + !evaluate::IsInteroperableIntrinsicType( + *dyType, &context_.languageFeatures()) .value_or(false)) { - auto maybeDyType{evaluate::DynamicType::From(*type)}; if (type->category() == DeclTypeSpec::Logical) { if (context_.ShouldWarn(common::UsageWarning::LogicalVsCBool)) { msgs.Say(component.name(), "A LOGICAL component of an interoperable type should have the interoperable KIND=C_BOOL"_port_en_US); } - } else if (type->category() == DeclTypeSpec::Character && - maybeDyType && maybeDyType->kind() == 1) { + } else if (type->category() == DeclTypeSpec::Character && dyType && + dyType->kind() == 1) { if (context_.ShouldWarn(common::UsageWarning::BindCCharLength)) { msgs.Say(component.name(), "A CHARACTER component of an interoperable type should have length 1"_port_en_US); @@ -3106,10 +3106,15 @@ parser::Messages CheckHelper::WhyNotInteroperableObject(const Symbol &symbol) { type->category() == DeclTypeSpec::Character && type->characterTypeSpec().length().isDeferred()) { // ok; F'2023 18.3.7 p2(6) - } else if (derived || - IsInteroperableIntrinsicType(*type, context_.languageFeatures()) - .value_or(false)) { + } else if (derived) { // type has been checked + } else if (auto dyType{evaluate::DynamicType::From(*type)}; dyType && + evaluate::IsInteroperableIntrinsicType(*dyType, + InModuleFile() ? nullptr : &context_.languageFeatures()) + .value_or(false)) { // F'2023 18.3.7 p2(4,5) + // N.B. 
Language features are not passed to IsInteroperableIntrinsicType + // when processing a module file, since the module file might have been + // compiled with CUDA while the client is not. } else if (type->category() == DeclTypeSpec::Logical) { if (context_.ShouldWarn(common::UsageWarning::LogicalVsCBool) && !InModuleFile()) { diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp index d9a9576e9d676..b5a58ddca0ecd 100644 --- a/flang/lib/Semantics/compute-offsets.cpp +++ b/flang/lib/Semantics/compute-offsets.cpp @@ -114,6 +114,13 @@ void ComputeOffsetsHelper::Compute(Scope &scope) { dependents_.find(symbol) == dependents_.end() && equivalenceBlock_.find(symbol) == equivalenceBlock_.end()) { DoSymbol(*symbol); + if (auto *generic{symbol->detailsIf()}) { + if (Symbol * specific{generic->specific()}; + specific && !FindCommonBlockContaining(*specific)) { + // might be a shadowed procedure pointer + DoSymbol(*specific); + } + } } } // Ensure that the size is a multiple of the alignment diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 3684839c187e6..943512f75d7eb 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -210,7 +210,8 @@ class ArgumentAnalyzer { // or procedure pointer reference in a ProcedureDesignator. MaybeExpr ExpressionAnalyzer::Designate(DataRef &&ref) { const Symbol &last{ref.GetLastSymbol()}; - const Symbol &symbol{BypassGeneric(last).GetUltimate()}; + const Symbol &specific{BypassGeneric(last)}; + const Symbol &symbol{specific.GetUltimate()}; if (semantics::IsProcedure(symbol)) { if (symbol.attrs().test(semantics::Attr::ABSTRACT)) { Say("Abstract procedure interface '%s' may not be used as a designator"_err_en_US, @@ -226,6 +227,10 @@ MaybeExpr ExpressionAnalyzer::Designate(DataRef &&ref) { } else if (!symbol.attrs().test(semantics::Attr::INTRINSIC)) { if (symbol.has()) { Say("'%s' is not a specific procedure"_err_en_US, last.name()); + } else if (IsProcedurePointer(specific)) { + // For procedure pointers, retain associations so that data accesses + // from client modules will work. + return Expr{ProcedureDesignator{specific}}; } else { return Expr{ProcedureDesignator{symbol}}; } @@ -1956,7 +1961,7 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::ArrayConstructor &array) { // Check if implicit conversion of expr to the symbol type is legal (if needed), // and make it explicit if requested. -static MaybeExpr implicitConvertTo(const semantics::Symbol &sym, +static MaybeExpr ImplicitConvertTo(const semantics::Symbol &sym, Expr &&expr, bool keepConvertImplicit) { if (!keepConvertImplicit) { return ConvertToType(sym, std::move(expr)); @@ -2196,7 +2201,7 @@ MaybeExpr ExpressionAnalyzer::Analyze( // convert would cause a segfault. Lowering will deal with // conditionally converting and preserving the lower bounds in this // case. 
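
A toy sketch of the split that the comment above describes: when the component is allocatable, `ImplicitConvertTo` only verifies that a conversion exists and returns the operand untouched, leaving the conversion implicit for lowering. Stand-in types; the real code operates on `Expr<SomeType>` and the component's symbol:

#include <optional>
#include <string>

// Stand-in expression carrying just a type name.
struct Expr { std::string type; };

// Materializes a conversion when one is allowed (toy rules).
static std::optional<Expr> ConvertToType(const std::string &target, Expr &&e) {
  if (e.type == target)
    return std::move(e);
  if (e.type == "integer" && target == "real")
    return Expr{target};
  return std::nullopt; // not convertible
}

static std::optional<Expr> ImplicitConvertTo(const std::string &target,
                                             Expr &&e, bool keepConvertImplicit) {
  if (!keepConvertImplicit)
    return ConvertToType(target, std::move(e));
  // Probe convertibility on a copy, then hand back the original expression
  // unchanged so lowering can convert conditionally and keep the source's
  // lower bounds.
  Expr probe = e;
  if (ConvertToType(target, std::move(probe)))
    return std::move(e);
  return std::nullopt;
}
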
- if (MaybeExpr converted{implicitConvertTo( + if (MaybeExpr converted{ImplicitConvertTo( *symbol, std::move(*value), IsAllocatable(*symbol))}) { if (auto componentShape{GetShape(GetFoldingContext(), *symbol)}) { if (auto valueShape{GetShape(GetFoldingContext(), *converted)}) { @@ -4605,7 +4610,8 @@ std::optional ArgumentAnalyzer::GetDefinedAssignmentProc() { } for (std::size_t i{0}; !proc && i < actuals_.size(); ++i) { const Symbol *generic{nullptr}; - if (const Symbol *binding{FindBoundOp(oprName, i, generic, true)}) { + if (const Symbol * + binding{FindBoundOp(oprName, i, generic, /*isSubroutine=*/true)}) { if (CheckAccessibleSymbol(scope, DEREF(generic))) { // ignore inaccessible type-bound ASSIGNMENT(=) generic } else if (const Symbol * diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 2e86e0afc9bd0..d8f601212d8d0 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -618,6 +618,20 @@ class ScopeHandler : public ImplicitRulesVisitor { return *derivedType; } } + } else if constexpr (std::is_same_v) { + if (auto *d{symbol->detailsIf()}) { + if (!d->derivedType()) { + // procedure pointer with same name as a generic + auto *specific{d->specific()}; + if (!specific) { + specific = &currScope().MakeSymbol(name, attrs, std::move(details)); + d->set_specific(*specific); + } else { + SayAlreadyDeclared(name, *specific); + } + return *specific; + } + } } if (symbol->CanReplaceDetails(details)) { // update the existing symbol @@ -3035,14 +3049,26 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, return; } const Symbol &useUltimate{useSymbol.GetUltimate()}; + const auto *useGeneric{useUltimate.detailsIf()}; if (localSymbol->has()) { - localSymbol->set_details(UseDetails{localName, useSymbol}); - localSymbol->attrs() = - useSymbol.attrs() & ~Attrs{Attr::PUBLIC, Attr::PRIVATE, Attr::SAVE}; - localSymbol->implicitAttrs() = - localSymbol->attrs() & Attrs{Attr::ASYNCHRONOUS, Attr::VOLATILE}; - localSymbol->flags() = useSymbol.flags(); - return; + if (useGeneric && useGeneric->specific() && + IsProcedurePointer(*useGeneric->specific())) { + // We are use-associating a generic that shadows a procedure pointer. + // Local references that might be made to that procedure pointer should + // use a UseDetails symbol for proper data addressing. So create an + // empty local generic now into which the use-associated generic may + // be copied. 
+ localSymbol->set_details(GenericDetails{}); + localSymbol->get().set_kind(useGeneric->kind()); + } else { // just create UseDetails + localSymbol->set_details(UseDetails{localName, useSymbol}); + localSymbol->attrs() = + useSymbol.attrs() & ~Attrs{Attr::PUBLIC, Attr::PRIVATE, Attr::SAVE}; + localSymbol->implicitAttrs() = + localSymbol->attrs() & Attrs{Attr::ASYNCHRONOUS, Attr::VOLATILE}; + localSymbol->flags() = useSymbol.flags(); + return; + } } Symbol &localUltimate{localSymbol->GetUltimate()}; @@ -3066,10 +3092,7 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, // - anything other than a derived type, non-generic procedure, or // generic procedure being combined with something other than an // prior USE association of itself - auto *localGeneric{localUltimate.detailsIf()}; - const auto *useGeneric{useUltimate.detailsIf()}; - Symbol *localDerivedType{nullptr}; if (localUltimate.has()) { localDerivedType = &localUltimate; @@ -3261,6 +3284,15 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, // At this point, there must be at least one generic interface. CHECK(localGeneric || (useGeneric && (localDerivedType || localProcedure))); + // Ensure that a use-associated specific procedure that is a procedure + // pointer is properly represented as a USE association of an entity. + if (IsProcedurePointer(useProcedure)) { + Symbol &combined{currScope().MakeSymbol(localSymbol->name(), + useProcedure->attrs(), UseDetails{localName, *useProcedure})}; + combined.flags() |= useProcedure->flags(); + combinedProcedure = &combined; + } + if (localGeneric) { // Create a local copy of a previously use-associated generic so that // it can be locally extended without corrupting the original. @@ -3639,36 +3671,36 @@ void InterfaceVisitor::CheckGenericProcedures(Symbol &generic) { } return; } - const Symbol &firstSpecific{specifics.front()}; - bool isFunction{firstSpecific.test(Symbol::Flag::Function)}; - bool isBoth{false}; + const Symbol *function{nullptr}; + const Symbol *subroutine{nullptr}; for (const Symbol &specific : specifics) { - if (isFunction != specific.test(Symbol::Flag::Function)) { // C1514 - if (context().ShouldWarn( + if (!function && specific.test(Symbol::Flag::Function)) { + function = &specific; + } else if (!subroutine && specific.test(Symbol::Flag::Subroutine)) { + subroutine = &specific; + if (details.derivedType() && + context().ShouldWarn( common::LanguageFeature::SubroutineAndFunctionSpecifics)) { + SayDerivedType(generic.name(), + "Generic interface '%s' should only contain functions due to derived type with same name"_warn_en_US, + *details.derivedType()->GetUltimate().scope()); + } + } + if (function && subroutine) { + if (context().ShouldWarn(common::LanguageFeature:: + SubroutineAndFunctionSpecifics)) { // C1514 auto &msg{Say(generic.name(), "Generic interface '%s' has both a function and a subroutine"_warn_en_US)}; - if (isFunction) { - msg.Attach(firstSpecific.name(), "Function declaration"_en_US); - msg.Attach(specific.name(), "Subroutine declaration"_en_US); - } else { - msg.Attach(firstSpecific.name(), "Subroutine declaration"_en_US); - msg.Attach(specific.name(), "Function declaration"_en_US); - } + msg.Attach(function->name(), "Function declaration"_en_US); + msg.Attach(subroutine->name(), "Subroutine declaration"_en_US); } - isFunction = false; - isBoth = true; break; } } - if (!isFunction && details.derivedType()) { - SayDerivedType(generic.name(), - "Generic interface '%s' may only contain functions due to derived 
type" - " with same name"_err_en_US, - *details.derivedType()->GetUltimate().scope()); - } - if (!isBoth) { - generic.set(isFunction ? Symbol::Flag::Function : Symbol::Flag::Subroutine); + if (function && !subroutine) { + generic.set(Symbol::Flag::Function); + } else if (subroutine && !function) { + generic.set(Symbol::Flag::Subroutine); } } @@ -5079,7 +5111,22 @@ bool DeclarationVisitor::HasCycle( Symbol &DeclarationVisitor::DeclareProcEntity( const parser::Name &name, Attrs attrs, const Symbol *interface) { - Symbol &symbol{DeclareEntity(name, attrs)}; + Symbol *proc{nullptr}; + if (auto *extant{FindInScope(name)}) { + if (auto *d{extant->detailsIf()}; d && !d->derivedType()) { + // procedure pointer with same name as a generic + if (auto *specific{d->specific()}) { + SayAlreadyDeclared(name, *specific); + } else { + // Create the ProcEntityDetails symbol in the scope as the "specific()" + // symbol behind an existing GenericDetails symbol of the same name. + proc = &Resolve(name, + currScope().MakeSymbol(name.source, attrs, ProcEntityDetails{})); + d->set_specific(*proc); + } + } + } + Symbol &symbol{proc ? *proc : DeclareEntity(name, attrs)}; if (auto *details{symbol.detailsIf()}) { if (context().HasError(symbol)) { } else if (HasCycle(symbol, interface)) { diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 9f3eb5fbe11a1..427a8421aeaf9 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -149,6 +149,10 @@ class RuntimeTableBuilder { SomeExpr explicitEnum_; // Value::Genre::Explicit SomeExpr lenParameterEnum_; // Value::Genre::LenParameter SomeExpr scalarAssignmentEnum_; // SpecialBinding::Which::ScalarAssignment + SomeExpr + scalarAllocatableAssignmentEnum_; // SpecialBinding::Which::ScalarAllocatableAssignment + SomeExpr + scalarPointerAssignmentEnum_; // SpecialBinding::Which::ScalarPointerAssignment SomeExpr elementalAssignmentEnum_; // SpecialBinding::Which::ElementalAssignment SomeExpr readFormattedEnum_; // SpecialBinding::Which::ReadFormatted @@ -174,6 +178,9 @@ RuntimeTableBuilder::RuntimeTableBuilder( explicitEnum_{GetEnumValue("explicit")}, lenParameterEnum_{GetEnumValue("lenparameter")}, scalarAssignmentEnum_{GetEnumValue("scalarassignment")}, + scalarAllocatableAssignmentEnum_{ + GetEnumValue("scalarallocatableassignment")}, + scalarPointerAssignmentEnum_{GetEnumValue("scalarpointerassignment")}, elementalAssignmentEnum_{GetEnumValue("elementalassignment")}, readFormattedEnum_{GetEnumValue("readformatted")}, readUnformattedEnum_{GetEnumValue("readunformatted")}, @@ -1122,10 +1129,10 @@ void RuntimeTableBuilder::DescribeSpecialProc( // Non-type-bound generic INTERFACEs and assignments from distinct // types must not be used for component intrinsic assignment. CHECK(proc->dummyArguments.size() == 2); - const auto t1{ + const auto &ddo1{ DEREF(std::get_if( - &proc->dummyArguments[0].u)) - .type.type()}; + &proc->dummyArguments[0].u))}; + const auto t1{ddo1.type.type()}; const auto t2{ DEREF(std::get_if( &proc->dummyArguments[1].u)) @@ -1137,7 +1144,13 @@ void RuntimeTableBuilder::DescribeSpecialProc( return; } which = proc->IsElemental() ? elementalAssignmentEnum_ - : scalarAssignmentEnum_; + : ddo1.attrs.test( + evaluate::characteristics::DummyDataObject::Attr::Allocatable) + ? scalarAllocatableAssignmentEnum_ + : ddo1.attrs.test( + evaluate::characteristics::DummyDataObject::Attr::Pointer) + ? 
scalarPointerAssignmentEnum_ + : scalarAssignmentEnum_; if (binding && binding->passName() && *binding->passName() == proc->dummyArguments[1].name) { argThatMightBeDescriptor = 1; diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp index b593bf89b18bc..14d6564664f2c 100644 --- a/flang/lib/Semantics/symbol.cpp +++ b/flang/lib/Semantics/symbol.cpp @@ -210,8 +210,9 @@ const Symbol *GenericDetails::CheckSpecific() const { } Symbol *GenericDetails::CheckSpecific() { if (specific_ && !specific_->has()) { + const Symbol &ultimate{specific_->GetUltimate()}; for (const Symbol &proc : specificProcs_) { - if (&proc == specific_) { + if (&proc.GetUltimate() == &ultimate) { return nullptr; } } diff --git a/flang/lib/Semantics/type.cpp b/flang/lib/Semantics/type.cpp index cfaee0b8ba6dc..aa6e8973ebd30 100644 --- a/flang/lib/Semantics/type.cpp +++ b/flang/lib/Semantics/type.cpp @@ -893,13 +893,4 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &o, const DeclTypeSpec &x) { return o << x.AsFortran(); } -std::optional IsInteroperableIntrinsicType( - const DeclTypeSpec &type, const common::LanguageFeatureControl &features) { - if (auto dyType{evaluate::DynamicType::From(type)}) { - return IsInteroperableIntrinsicType(*dyType, &features); - } else { - return std::nullopt; - } -} - } // namespace Fortran::semantics diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90 index 5f2273de1e3d1..7dfcfe71fcb32 100644 --- a/flang/module/__fortran_type_info.f90 +++ b/flang/module/__fortran_type_info.f90 @@ -106,11 +106,14 @@ end type enum, bind(c) ! SpecialBinding::Which - enumerator :: ScalarAssignment = 1, ElementalAssignment = 2 - enumerator :: ReadFormatted = 3, ReadUnformatted = 4 - enumerator :: WriteFormatted = 5, WriteUnformatted = 6 - enumerator :: ElementalFinal = 7, AssumedRankFinal = 8 - enumerator :: ScalarFinal = 9 ! higher-rank final procedures follow + enumerator :: ScalarAssignment = 1 + enumerator :: ScalarAllocatableAssignment = 2 + enumerator :: ScalarPointerAssignment = 3 + enumerator :: ElementalAssignment = 4 + enumerator :: ReadFormatted = 5, ReadUnformatted = 6 + enumerator :: WriteFormatted = 7, WriteUnformatted = 8 + enumerator :: ElementalFinal = 9, AssumedRankFinal = 10 + enumerator :: ScalarFinal = 11 ! higher-rank final procedures follow end enum type, bind(c) :: SpecialBinding diff --git a/flang/runtime/assign.cpp b/flang/runtime/assign.cpp index d558ada51cd21..166cf54778921 100644 --- a/flang/runtime/assign.cpp +++ b/flang/runtime/assign.cpp @@ -352,6 +352,17 @@ RT_API_ATTRS static void Assign( // the Assign() is invoked recursively for component-per-component // assignments. 
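An editorial note on the renumbering above: the runtime's `specialbitset` is a mask holding bit `1 << which` for each special binding a type has, so shifting the `Which` enumerators shifts every expected bitset in the typeinfo tests updated later in this patch. For example, an elemental assignment moves from 1 << 2 = 4 to 1 << 4 = 16, the four defined-I/O bindings move from 8+16+32+64 = 120 to 32+64+128+256 = 480, and the finalizer set 7296 becomes 29184. A small self-contained sketch of that arithmetic using the new numbering (`SpecialBitSet` is an illustrative helper, not flang's API; needs C++17); the assign.cpp code that follows then consults the two new bindings before falling back to the generic `ScalarAssignment` one:

```cpp
#include <cstdint>
#include <initializer_list>

// Which enumerators below copy the *new* numbering from
// flang/module/__fortran_type_info.f90 in this patch.
enum class Which : std::uint8_t {
  None = 0,
  ScalarAssignment = 1,
  ScalarAllocatableAssignment = 2,
  ScalarPointerAssignment = 3,
  ElementalAssignment = 4,
  ReadFormatted = 5,
  ReadUnformatted = 6,
  WriteFormatted = 7,
  WriteUnformatted = 8,
  ElementalFinal = 9,
  AssumedRankFinal = 10,
  ScalarFinal = 11,
};

// specialbitset is just the OR of (1 << which) over a type's special
// bindings, so renumbering Which shifts every expected test value.
constexpr std::uint32_t SpecialBitSet(std::initializer_list<Which> bindings) {
  std::uint32_t mask{0};
  for (Which w : bindings) {
    mask |= std::uint32_t{1} << static_cast<int>(w);
  }
  return mask;
}

static_assert(SpecialBitSet({Which::ElementalAssignment}) == 16);
static_assert(SpecialBitSet({Which::ReadFormatted, Which::ReadUnformatted,
                             Which::WriteFormatted,
                             Which::WriteUnformatted}) == 480);
```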
if (to.rank() == 0) { + if (to.IsAllocatable()) { + if (const auto *special{toDerived->FindSpecialBinding(typeInfo:: + SpecialBinding::Which::ScalarAllocatableAssignment)}) { + return DoScalarDefinedAssignment(to, from, *special); + } + } else if (to.IsPointer()) { + if (const auto *special{toDerived->FindSpecialBinding( + typeInfo::SpecialBinding::Which::ScalarPointerAssignment)}) { + return DoScalarDefinedAssignment(to, from, *special); + } + } if (const auto *special{toDerived->FindSpecialBinding( typeInfo::SpecialBinding::Which::ScalarAssignment)}) { return DoScalarDefinedAssignment(to, from, *special); @@ -417,9 +428,8 @@ RT_API_ATTRS static void Assign( StaticDescriptor statDesc[2]; Descriptor &toCompDesc{statDesc[0].descriptor()}; Descriptor &fromCompDesc{statDesc[1].descriptor()}; - comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt); - comp.CreatePointerDescriptor( - fromCompDesc, from, terminator, fromAt); + comp.CreateTargetDescriptor(toCompDesc, to, terminator, toAt); + comp.CreateTargetDescriptor(fromCompDesc, from, terminator, fromAt); Assign(toCompDesc, fromCompDesc, terminator, nestedFlags); } else { // Component has intrinsic type; simply copy raw bytes std::size_t componentByteSize{comp.SizeInBytes(to)}; diff --git a/flang/runtime/descriptor-io.h b/flang/runtime/descriptor-io.h index ff5f683c6da52..66158b4076164 100644 --- a/flang/runtime/descriptor-io.h +++ b/flang/runtime/descriptor-io.h @@ -255,7 +255,7 @@ static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io, // Create a descriptor for the component StaticDescriptor statDesc; Descriptor &desc{statDesc.descriptor()}; - component.CreatePointerDescriptor( + component.CreateTargetDescriptor( desc, origDescriptor, terminator, origSubscripts); return DescriptorIO(io, desc, table); } else { diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h index 74254bebe6e7a..46204ca927c13 100644 --- a/flang/runtime/format-implementation.h +++ b/flang/runtime/format-implementation.h @@ -443,8 +443,9 @@ RT_API_ATTRS int FormatControl::CueUpNextDataEdit( if (ch != 'P') { // 1PE5.2 - comma not required (C1302) CharType peek{Capitalize(PeekNext())}; if (peek >= 'A' && peek <= 'Z') { - if (ch == 'A' /* anticipate F'202X AT editing */ || ch == 'B' || - ch == 'D' || ch == 'E' || ch == 'R' || ch == 'S' || ch == 'T') { + if ((ch == 'A' && peek == 'T' /* anticipate F'202X AT editing */) || + ch == 'B' || ch == 'D' || ch == 'E' || ch == 'R' || ch == 'S' || + ch == 'T') { // Assume a two-letter edit descriptor next = peek; ++offset_; diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h index d67d1ec80afce..2e0ca46078ecd 100644 --- a/flang/runtime/io-stmt.h +++ b/flang/runtime/io-stmt.h @@ -194,8 +194,9 @@ class IoStatementState { std::size_t &byteCount) { auto ch{GetCurrentChar(byteCount)}; bool inNamelist{mutableModes().inNamelist}; - while (!ch || *ch == ' ' || *ch == '\t' || (inNamelist && *ch == '!')) { - if (ch && (*ch == ' ' || *ch == '\t')) { + while (!ch || *ch == ' ' || *ch == '\t' || *ch == '\n' || + (inNamelist && *ch == '!')) { + if (ch && (*ch == ' ' || *ch == '\t' || *ch == '\n')) { HandleRelativePosition(byteCount); } else if (!AdvanceRecord()) { return Fortran::common::nullopt; diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp index af092de70f781..fe26a0d3a6e89 100644 --- a/flang/runtime/namelist.cpp +++ b/flang/runtime/namelist.cpp @@ -362,7 +362,7 @@ static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc, 
io.HandleRelativePosition(byteCount); // skip over '(' StaticDescriptor staticDesc; Descriptor &tmpDesc{staticDesc.descriptor()}; - comp->CreatePointerDescriptor(tmpDesc, source, handler); + comp->CreateTargetDescriptor(tmpDesc, source, handler); if (!HandleSubscripts(io, desc, tmpDesc, compName)) { return false; } @@ -370,7 +370,7 @@ static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc, } } if (!createdDesc) { - comp->CreatePointerDescriptor(desc, source, handler); + comp->CreateTargetDescriptor(desc, source, handler); } if (source.rank() > 0) { if (desc.rank() > 0) { diff --git a/flang/runtime/time-intrinsic.cpp b/flang/runtime/time-intrinsic.cpp index 7e590eabf3966..e6f6e81c7b50c 100644 --- a/flang/runtime/time-intrinsic.cpp +++ b/flang/runtime/time-intrinsic.cpp @@ -490,16 +490,20 @@ void RTNAME(Etime)(const Descriptor *values, const Descriptor *time, auto typeCode{values->type().GetCategoryAndKind()}; // ETIME values argument must have decimal range == 2. RUNTIME_CHECK(terminator, - values->rank() == 1 && values->GetDimension(0).Extent() == 2 && - typeCode && typeCode->first == Fortran::common::TypeCategory::Real); + values->rank() == 1 && typeCode && + typeCode->first == Fortran::common::TypeCategory::Real); // Only accept KIND=4 here. int kind{typeCode->second}; RUNTIME_CHECK(terminator, kind == 4); - - ApplyFloatingPointKind( - kind, terminator, *values, /* atIndex = */ 0, usrTime); - ApplyFloatingPointKind( - kind, terminator, *values, /* atIndex = */ 1, sysTime); + auto extent{values->GetDimension(0).Extent()}; + if (extent >= 1) { + ApplyFloatingPointKind( + kind, terminator, *values, /* atIndex = */ 0, usrTime); + } + if (extent >= 2) { + ApplyFloatingPointKind( + kind, terminator, *values, /* atIndex = */ 1, sysTime); + } } if (time) { diff --git a/flang/runtime/type-info.cpp b/flang/runtime/type-info.cpp index cb18c5669b5ff..531944086c7f7 100644 --- a/flang/runtime/type-info.cpp +++ b/flang/runtime/type-info.cpp @@ -134,7 +134,7 @@ RT_API_ATTRS void Component::EstablishDescriptor(Descriptor &descriptor, } } -RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor, +RT_API_ATTRS void Component::CreateTargetDescriptor(Descriptor &descriptor, const Descriptor &container, Terminator &terminator, const SubscriptValue *subscripts) const { RUNTIME_CHECK(terminator, genre_ == Genre::Data); @@ -144,7 +144,6 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor, } else { descriptor.set_base_addr(container.OffsetElement() + offset_); } - descriptor.raw().attribute = CFI_attribute_pointer; } RT_API_ATTRS const DerivedType *DerivedType::GetParentType() const { @@ -297,6 +296,12 @@ FILE *SpecialBinding::Dump(FILE *f) const { case Which::ScalarAssignment: std::fputs(" ScalarAssignment", f); break; + case Which::ScalarAllocatableAssignment: + std::fputs(" ScalarAllocatableAssignment", f); + break; + case Which::ScalarPointerAssignment: + std::fputs(" ScalarPointerAssignment", f); + break; case Which::ElementalAssignment: std::fputs(" ElementalAssignment", f); break; diff --git a/flang/runtime/type-info.h b/flang/runtime/type-info.h index c3f3595e32ef2..340971bfacf3e 100644 --- a/flang/runtime/type-info.h +++ b/flang/runtime/type-info.h @@ -89,9 +89,9 @@ class Component { RT_API_ATTRS void EstablishDescriptor( Descriptor &, const Descriptor &container, Terminator &) const; - // Creates a pointer descriptor from this component description, possibly + // Creates a descriptor from this component description, possibly // with 
subscripts - RT_API_ATTRS void CreatePointerDescriptor(Descriptor &, + RT_API_ATTRS void CreateTargetDescriptor(Descriptor &, const Descriptor &container, Terminator &, const SubscriptValue * = nullptr) const; @@ -126,14 +126,16 @@ class SpecialBinding { enum class Which : std::uint8_t { None = 0, ScalarAssignment = 1, - ElementalAssignment = 2, - ReadFormatted = 3, - ReadUnformatted = 4, - WriteFormatted = 5, - WriteUnformatted = 6, - ElementalFinal = 7, - AssumedRankFinal = 8, - ScalarFinal = 9, + ScalarAllocatableAssignment = 2, + ScalarPointerAssignment = 3, + ElementalAssignment = 4, + ReadFormatted = 5, + ReadUnformatted = 6, + WriteFormatted = 7, + WriteUnformatted = 8, + ElementalFinal = 9, + AssumedRankFinal = 10, + ScalarFinal = 11, // higher-ranked final procedures follow }; diff --git a/flang/test/Evaluate/fold-assumed-rank-kind.f90 b/flang/test/Evaluate/fold-assumed-rank-kind.f90 new file mode 100644 index 0000000000000..674f60c6a0e2f --- /dev/null +++ b/flang/test/Evaluate/fold-assumed-rank-kind.f90 @@ -0,0 +1,6 @@ +! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +subroutine subr(ar) + real(8) :: ar(..) +!CHECK: PRINT *, 8_4 + print *, kind(ar) +end diff --git a/flang/test/Integration/debug-module-2.f90 b/flang/test/Integration/debug-module-2.f90 index 60fccaa2a6c1f..f07416c3ef3cc 100644 --- a/flang/test/Integration/debug-module-2.f90 +++ b/flang/test/Integration/debug-module-2.f90 @@ -17,7 +17,7 @@ module helper integer gli contains -!CHECK-DAG: !DISubprogram(name: "test", linkageName: "_QMhelperPtest", scope: ![[MOD]], file: ![[FILE2]], line: [[@LINE+1]]{{.*}}unit: ![[CU]]) +!CHECK-DAG: !DISubprogram(name: "test", linkageName: "_QMhelperPtest", scope: ![[MOD]], file: ![[FILE2]], line: [[@LINE+1]]{{.*}}unit: ![[CU]]{{.*}}) subroutine test() glr = 12.34 gli = 67 diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index bcbac9c303142..11f1f33d7cb58 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -353,3 +353,13 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub17() ! CHECK: cuf.kernel<<<*, *>>> ! CHECK-NOT: cuf.data_transfer + +subroutine sub18() + integer, device, allocatable :: a(:) + integer :: isz + + isz = size(a) +end subroutine + +! CHECK-LABEL: func.func @_QPsub18() +! CHECK-NOT: cuf.data_transfer diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf index 21c5088b640fc..2eac890970d52 100644 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ -40,8 +40,9 @@ end ! CHECK: %[[X:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Ex"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> ! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref}>>, !fir.field) -> !fir.ref> -! CHECK: %[[CPTR_LOAD:.*]] = fir.load %[[CPTR_COORD]] : !fir.ref> -! CHECK: %[[ADDRESS:.*]] = fir.extract_value %[[CPTR_LOAD]], [0 : index] : (!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) -> i64 -! CHECK: %[[ADDRESS_IDX:.*]] = fir.convert %[[ADDRESS]] : (i64) -> !fir.ptr> +! CHECK: %[[ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! 
CHECK: %[[ADDRESS_COORD:.*]] = fir.coordinate_of %[[CPTR_COORD]], %[[ADDRESS]] : (!fir.ref>, !fir.field) -> !fir.ref +! CHECK: %[[ADDRESS_LOADED:.*]] = fir.load %[[ADDRESS_COORD]] : !fir.ref +! CHECK: %[[ADDRESS_IDX:.*]] = fir.convert %[[ADDRESS_LOADED]] : (i64) -> !fir.ptr> ! CHECK: %[[EMBOX:.*]] = fir.embox %[[ADDRESS_IDX]](%{{.*}}) : (!fir.ptr>, !fir.shape<1>) -> !fir.box>> ! CHECK: fir.store %[[EMBOX]] to %[[X]]#1 : !fir.ref>>> diff --git a/flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90 b/flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90 new file mode 100644 index 0000000000000..ff447d31b1af1 --- /dev/null +++ b/flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90 @@ -0,0 +1,46 @@ +! Test procedure pointers with the same name as generics. +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s + +module m_gen + procedure(func), pointer :: foo + interface foo + procedure :: foo + end interface + interface + real function func(x) + real :: x + end function + end interface +end +!CHECK-LABEL: fir.global @_QMm_genEfoo : !fir.boxproc<(!fir.ref) -> f32> { +!CHECK: %[[VAL_0:.*]] = fir.zero_bits (!fir.ref) -> f32 +!CHECK: %[[VAL_1:.*]] = fir.emboxproc %[[VAL_0]] : ((!fir.ref) -> f32) -> !fir.boxproc<(!fir.ref) -> f32> +!CHECK: fir.has_value %[[VAL_1]] : !fir.boxproc<(!fir.ref) -> f32> + +subroutine test1() + use m_gen + foo => func +end subroutine +!CHECK-LABEL: func.func @_QPtest1() { +!CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMm_genEfoo) : !fir.ref) -> f32>> +!CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}"_QMm_genEfoo"{{.*}} : (!fir.ref) -> f32>>) -> (!fir.ref) -> f32>>, !fir.ref) -> f32>>) +!CHECK: %[[VAL_2:.*]] = fir.address_of(@_QPfunc) : (!fir.ref) -> f32 +!CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : ((!fir.ref) -> f32) -> !fir.boxproc<() -> ()> +!CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> !fir.boxproc<(!fir.ref) -> f32> +!CHECK: fir.store %[[VAL_4]] to %[[VAL_1]]#0 : !fir.ref) -> f32>> + +subroutine test_local() + use m_gen, only : func + procedure(func), pointer :: foo + interface foo + procedure :: foo + end interface + foo => func +end subroutine +!CHECK-LABEL: func.func @_QPtest_local() { +!CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxproc<(!fir.ref) -> f32> {bindc_name = "foo", uniq_name = "_QFtest_localEfoo"} +!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}"_QFtest_localEfoo"{{.*}} : (!fir.ref) -> f32>>) -> (!fir.ref) -> f32>>, !fir.ref) -> f32>>) +!CHECK: %[[VAL_4:.*]] = fir.address_of(@_QPfunc) : (!fir.ref) -> f32 +!CHECK: %[[VAL_5:.*]] = fir.emboxproc %[[VAL_4]] : ((!fir.ref) -> f32) -> !fir.boxproc<() -> ()> +!CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.boxproc<() -> ()>) -> !fir.boxproc<(!fir.ref) -> f32> +!CHECK: fir.store %[[VAL_6]] to %[[VAL_3]]#0 : !fir.ref) -> f32>> diff --git a/flang/test/Semantics/Inputs/modfile66.cuf b/flang/test/Semantics/Inputs/modfile66.cuf new file mode 100644 index 0000000000000..be400da749148 --- /dev/null +++ b/flang/test/Semantics/Inputs/modfile66.cuf @@ -0,0 +1,4 @@ +module usereal2 + !REAL(2) is interoperable under CUDA + real(2), bind(c) :: x +end diff --git a/flang/test/Semantics/Inputs/modfile67.mod b/flang/test/Semantics/Inputs/modfile67.mod new file mode 100644 index 0000000000000..1aa0158e35089 --- /dev/null +++ b/flang/test/Semantics/Inputs/modfile67.mod @@ -0,0 +1,16 @@ +!mod$ v1 sum:37cfecee3234c8ab +module modfile67 +type::t +procedure(foo),nopass,pointer::p +end type +contains +pure function foo(n,a) result(r) +integer(4),intent(in)::n 
+real(4),intent(in)::a(1_8:int(n,kind=8)) +logical(4)::r(1_8:int(int(max(0_8,int(n,kind=8)),kind=4),kind=8)) +end +function fooptr(f) +procedure(foo)::f +type(t)::fooptr +end +end diff --git a/flang/test/Semantics/generic10.f90 b/flang/test/Semantics/generic10.f90 new file mode 100644 index 0000000000000..203d0bb855208 --- /dev/null +++ b/flang/test/Semantics/generic10.f90 @@ -0,0 +1,17 @@ +! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +module m + procedure(func), pointer :: foo + interface foo + procedure :: foo + end interface + contains + function func(x) + func = x + end +end + +program main + use m +!CHECK: foo => func + foo => func +end diff --git a/flang/test/Semantics/modfile66.f90 b/flang/test/Semantics/modfile66.f90 new file mode 100644 index 0000000000000..51b4d8375d50d --- /dev/null +++ b/flang/test/Semantics/modfile66.f90 @@ -0,0 +1,3 @@ +! RUN: %flang_fc1 -fsyntax-only %S/Inputs/modfile66.cuf && %flang_fc1 -fsyntax-only %s +use usereal2 ! valid since x is not used +end diff --git a/flang/test/Semantics/modfile67.f90 b/flang/test/Semantics/modfile67.f90 new file mode 100644 index 0000000000000..18cf95bd42fbf --- /dev/null +++ b/flang/test/Semantics/modfile67.f90 @@ -0,0 +1,35 @@ +!RUN: %flang_fc1 -fsyntax-only -J%S/Inputs %s + +#if 0 +!modfile67.mod was produced from this source, and must be read into this +!compilation from its module file in order to truly test this fix. +module modfile67 + type t + procedure(foo), nopass, pointer :: p + end type + contains + pure function foo(n,a) result(r) + integer, intent(in) :: n + real, intent(in), dimension(n) :: a + logical, dimension(size(a)) :: r + r = .false. + end + type(t) function fooptr(f) + procedure(foo) f + fooptr%p => f + end +end +#endif + +program test + use modfile67 + type(t) x + x = fooptr(bar) ! ensure no bogus error about procedure incompatibility + contains + pure function bar(n,a) result(r) + integer, intent(in) :: n + real, intent(in), dimension(n) :: a + logical, dimension(size(a)) :: r + r = .false. + end +end diff --git a/flang/test/Semantics/resolve24.f90 b/flang/test/Semantics/resolve24.f90 index 4af6f202cf4f1..72d6719665bb5 100644 --- a/flang/test/Semantics/resolve24.f90 +++ b/flang/test/Semantics/resolve24.f90 @@ -1,6 +1,6 @@ ! 
RUN: %python %S/test_errors.py %s %flang_fc1 subroutine test1 - !ERROR: Generic interface 'foo' has both a function and a subroutine + !WARNING: Generic interface 'foo' has both a function and a subroutine interface foo subroutine s1(x) end subroutine @@ -12,7 +12,7 @@ function f() end subroutine subroutine test2 - !ERROR: Generic interface 'foo' has both a function and a subroutine + !WARNING: Generic interface 'foo' has both a function and a subroutine interface foo function t2f1(x) end function @@ -24,7 +24,7 @@ function t2f2(x, y) end subroutine module test3 - !ERROR: Generic interface 'foo' has both a function and a subroutine + !WARNING: Generic interface 'foo' has both a function and a subroutine interface foo module procedure s module procedure f @@ -39,7 +39,7 @@ function f() subroutine test4 type foo end type - !ERROR: Generic interface 'foo' may only contain functions due to derived type with same name + !WARNING: Generic interface 'foo' should only contain functions due to derived type with same name interface foo subroutine s() end subroutine diff --git a/flang/test/Semantics/resolve69.f90 b/flang/test/Semantics/resolve69.f90 index e1f7773eee9da..5acfd30604fe3 100644 --- a/flang/test/Semantics/resolve69.f90 +++ b/flang/test/Semantics/resolve69.f90 @@ -16,7 +16,7 @@ subroutine s1() ! integer, parameter :: constVal = 1 integer :: nonConstVal = 1 -!ERROR: Invalid specification expression: reference to local entity 'nonconstval' +!PORTABILITY: specification expression refers to local object 'nonconstval' (initialized and saved) character(nonConstVal) :: colonString1 character(len=20, kind=constVal + 1) :: constKindString character(len=:, kind=constVal + 1), pointer :: constKindString1 @@ -53,13 +53,13 @@ function foo3() type (derived(constVal, 3)) :: constDerivedKind !ERROR: Value of KIND type parameter 'typekind' must be constant -!ERROR: Invalid specification expression: reference to local entity 'nonconstval' +!PORTABILITY: specification expression refers to local object 'nonconstval' (initialized and saved) type (derived(nonConstVal, 3)) :: nonConstDerivedKind !OK because all type-params are constants type (derived(3, constVal)) :: constDerivedLen -!ERROR: Invalid specification expression: reference to local entity 'nonconstval' +!PORTABILITY: specification expression refers to local object 'nonconstval' (initialized and saved) type (derived(3, nonConstVal)) :: nonConstDerivedLen !ERROR: 'colonderivedlen' has a type derived(typekind=3_4,typelen=:) with a deferred type parameter but is neither an allocatable nor an object pointer type (derived(3, :)) :: colonDerivedLen diff --git a/flang/test/Semantics/resolve77.f90 b/flang/test/Semantics/resolve77.f90 index 943993ee74d76..0133fac3bfbc5 100644 --- a/flang/test/Semantics/resolve77.f90 +++ b/flang/test/Semantics/resolve77.f90 @@ -60,6 +60,7 @@ pure integer function if2(n) block data common /blk2/ n data n/100/ + !PORTABILITY: specification expression refers to local object 'n' (initialized and saved) !ERROR: Automatic data object 'a' may not appear in a BLOCK DATA subprogram real a(n) end diff --git a/flang/test/Semantics/spec-expr.f90 b/flang/test/Semantics/spec-expr.f90 index aa010ed0bf7ed..9d209c3583b43 100644 --- a/flang/test/Semantics/spec-expr.f90 +++ b/flang/test/Semantics/spec-expr.f90 @@ -104,7 +104,7 @@ subroutine s7biii(x, y) integer :: local = 5 ! 
OK, since "localConst" is a constant real, dimension(localConst) :: realArray1 - !ERROR: Invalid specification expression: reference to local entity 'local' + !PORTABILITY: specification expression refers to local object 'local' (initialized and saved) real, dimension(local) :: realArray2 real, dimension(size(realArray1)) :: realArray3 ! ok real, dimension(size(x)) :: realArray4 ! ok diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90 index 0d381f10b0483..b6f0e2e12ff6f 100644 --- a/flang/test/Semantics/typeinfo01.f90 +++ b/flang/test/Semantics/typeinfo01.f90 @@ -102,8 +102,8 @@ impure elemental subroutine s1(x, y) class(t), intent(out) :: x class(t), intent(in) :: y end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=16_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=4_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] end module @@ -125,8 +125,8 @@ impure elemental subroutine s3(x) subroutine s4(x) type(t), contiguous :: x(:,:,:) end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=29184_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): 
ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=9_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=13_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=14_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)] end module module m09 @@ -167,8 +167,8 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=480_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=7_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=8_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)] end module @@ -216,8 +216,8 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=480_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=8_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] end module module m11 @@ -260,7 +260,7 @@ module m13 contains procedure :: assign1, assign2 generic :: assignment(=) => assign1, assign2 - ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=assign1)] + ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=4_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=assign1)] end type contains impure elemental subroutine assign1(to, from) diff --git a/flang/test/Semantics/typeinfo02.f90 b/flang/test/Semantics/typeinfo02.f90 index 29d14c7a0f196..2b911e7238f88 100644 --- a/flang/test/Semantics/typeinfo02.f90 +++ b/flang/test/Semantics/typeinfo02.f90 @@ -29,5 +29,5 @@ subroutine wf2(x,u,iot,v,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine end module -!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf1)] -!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf2)] +!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf1)] +!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf2)] diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90 index de8464321a409..2527f656da3d1 100644 --- a/flang/test/Semantics/typeinfo04.f90 +++ b/flang/test/Semantics/typeinfo04.f90 @@ -7,7 +7,7 @@ module m contains final :: final end type -!CHECK: 
.dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=512_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) type, abstract :: t1 end type !CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90 new file mode 100644 index 0000000000000..983e09be0055b --- /dev/null +++ b/flang/test/Semantics/typeinfo12.f90 @@ -0,0 +1,52 @@ +!RUN: bbc --dump-symbols %s | FileCheck %s +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s + +! Test defined assignment with allocatable / pointer LHS arguments. +! The special bindings for the defined assignments must reflect that +! their LHS arguments are allocatables and pointers. +! (This program is executable and should print 1; 102; 3 204.) + +module m + type :: base + integer :: i + contains + procedure, pass(src) :: ass1, ass2 + generic :: assignment(=) => ass1, ass2 + end type base + type, extends(base) :: derived + end type + +!CHECK: .dt.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.base,name=.n.base,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.base,procptr=NULL(),special=.s.base,specialbitset=12_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.derived, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.derived,name=.n.derived,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.derived,procptr=NULL(),special=.s.derived,specialbitset=12_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:1_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass1),specialbinding(which=3_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass2)] +!CHECK: .s.derived, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:1_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass1),specialbinding(which=3_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass2)] + +contains + subroutine ass1(res, src) + class(base), allocatable, intent(out) :: res + class(base), intent(in) :: src + allocate(res, source=src) 
+ res%i = res%i + 100 + end subroutine + subroutine ass2(res, src) + class(base), pointer, intent(in out) :: res + class(base), intent(in) :: src + allocate(res, source=src) + res%i = src%i + 200 + end subroutine +end +program genext + use m + type(derived) :: od1 + class(base), allocatable :: od2 + class(base), pointer :: od3a, od3b + od1 = derived(1) + print *, od1%i + od2 = derived(2) + print *, od2%i + allocate(od3a) + od3a%i = 3 + od3b => od3a + od3b = derived(4) + print *, od3a%i, od3b%i +end program genext diff --git a/flang/test/Transforms/debug-90683.fir b/flang/test/Transforms/debug-90683.fir index cc6929c10411f..a21332e3968a7 100644 --- a/flang/test/Transforms/debug-90683.fir +++ b/flang/test/Transforms/debug-90683.fir @@ -22,4 +22,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK-DAG: #[[TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[TY1:.*]] = #llvm.di_subroutine_type -// CHECK-DAG: #{{.*}} = #llvm.di_subprogram +// CHECK-DAG: #{{.*}} = #llvm.di_subprogram<{{.*}}name = "cabs", linkageName = "cabs"{{.*}}, type = #[[TY1]]> diff --git a/flang/test/Transforms/debug-fn-info.fir b/flang/test/Transforms/debug-fn-info.fir index f456e35d3dd70..5433e088a648d 100644 --- a/flang/test/Transforms/debug-fn-info.fir +++ b/flang/test/Transforms/debug-fn-info.fir @@ -69,7 +69,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK: #[[TY2:.*]] = #llvm.di_subroutine_type // Line numbers should match the number in corresponding loc entry. -// CHECK: #llvm.di_subprogram -// CHECK: #llvm.di_subprogram -// CHECK: #llvm.di_subprogram +// CHECK: #llvm.di_subprogram<{{.*}}name = "_QQmain", linkageName = "_QQmain", file = {{.*}}, line = 15, scopeLine = 15, subprogramFlags = Definition, type = #[[TY0]]> +// CHECK: #llvm.di_subprogram<{{.*}}name = "fn1", linkageName = "_QFPfn1", file = {{.*}}, line = 26, scopeLine = 26, subprogramFlags = Definition, type = #[[TY1]]> +// CHECK: #llvm.di_subprogram<{{.*}}name = "fn2", linkageName = "_QFPfn2", file = {{.*}}, line = 43, scopeLine = 43, subprogramFlags = Definition, type = #[[TY2]]> diff --git a/flang/test/Transforms/debug-imported-entity.fir b/flang/test/Transforms/debug-imported-entity.fir new file mode 100644 index 0000000000000..7be6531a703a8 --- /dev/null +++ b/flang/test/Transforms/debug-imported-entity.fir @@ -0,0 +1,30 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + + +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { + fir.global @_QMfooEv1 : i32 { + %0 = fir.zero_bits i32 + fir.has_value %0 : i32 + } + fir.global internal @_QFtestExyz : i32 { + %c12_i32 = arith.constant 12 : i32 + fir.has_value %c12_i32 : i32 + } loc(#loc4) + func.func @test() attributes {fir.bindc_name = "test"} { + %0 = fir.address_of(@_QMfooEv1) : !fir.ref + %1 = fircg.ext_declare %0 {uniq_name = "_QMfooEv1"} : (!fir.ref) -> !fir.ref loc(#loc1) + %4 = fir.address_of(@_QFtestExyz) : !fir.ref + %5 = fircg.ext_declare %4 {uniq_name = "_QFtestExyz"} : (!fir.ref) -> !fir.ref loc(#loc4) + return + } loc(#loc3) +} +#loc1 = loc("test.f90":2:14) +#loc2 = loc("test.f90":6:1) +#loc3 = loc("test.f90":10:1) +#loc4 = loc("test.f90":13:1) + +// CHECK: #[[MOD:.+]] = #llvm.di_module<{{.*}}name = "foo"{{.*}}> +// CHECK: #[[SP_REC:.+]] = #llvm.di_subprogram, isRecSelf = true{{.*}}> +// CHECK: #[[IMP_ENTITY:.+]] = #llvm.di_imported_entity +// CHECK: #[[SP:.+]] = #llvm.di_subprogram{{.*}}retainedNodes = #[[IMP_ENTITY]]> +// CHECK: #llvm.di_global_variable diff --git a/flang/test/Transforms/debug-line-table-inc-file.fir 
b/flang/test/Transforms/debug-line-table-inc-file.fir index 065039b59c5ae..216cd5e016f2f 100644 --- a/flang/test/Transforms/debug-line-table-inc-file.fir +++ b/flang/test/Transforms/debug-line-table-inc-file.fir @@ -31,7 +31,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK: #[[LOC_INC_FILE:.*]] = loc("{{.*}}inc.f90":1:1) // CHECK: #[[LOC_FILE:.*]] = loc("{{.*}}simple.f90":3:1) // CHECK: #[[DI_CU:.*]] = #llvm.di_compile_unit, sourceLanguage = DW_LANG_Fortran95, file = #[[DI_FILE]], producer = "{{.*}}flang{{.*}}", isOptimized = false, emissionKind = LineTablesOnly> -// CHECK: #[[DI_SP_INC:.*]] = #llvm.di_subprogram, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "sinc", linkageName = "_QPsinc", file = #[[DI_INC_FILE]], {{.*}}> -// CHECK: #[[DI_SP:.*]] = #llvm.di_subprogram, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "_QQmain", linkageName = "_QQmain", file = #[[DI_FILE]], {{.*}}> +// CHECK: #[[DI_SP_INC:.*]] = #llvm.di_subprogram<{{.*}}id = distinct[{{.*}}]<>, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "sinc", linkageName = "_QPsinc", file = #[[DI_INC_FILE]], {{.*}}> +// CHECK: #[[DI_SP:.*]] = #llvm.di_subprogram<{{.*}}id = distinct[{{.*}}]<>, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "_QQmain", linkageName = "_QQmain", file = #[[DI_FILE]], {{.*}}> // CHECK: #[[FUSED_LOC_INC_FILE]] = loc(fused<#[[DI_SP_INC]]>[#[[LOC_INC_FILE]]]) // CHECK: #[[FUSED_LOC_FILE]] = loc(fused<#[[DI_SP]]>[#[[LOC_FILE]]]) diff --git a/flang/test/Transforms/debug-local-global-storage-1.fir b/flang/test/Transforms/debug-local-global-storage-1.fir index d9d8083a14709..83a9055a6b8dc 100644 --- a/flang/test/Transforms/debug-local-global-storage-1.fir +++ b/flang/test/Transforms/debug-local-global-storage-1.fir @@ -45,7 +45,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : // CHECK-DAG: #[[CU:.*]] = #llvm.di_compile_unit<{{.*}}> // CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}scope = #[[CU]]{{.*}}name = "example"{{.*}}> // CHECK-DAG: #[[SP:.*]] = #llvm.di_subprogram<{{.*}}name = "_QQmain"{{.*}}> -// CHECK-DAG: #[[MOD_SP:.*]] = #llvm.di_subprogram<{{.*}}name = "mod_sub"{{.*}}> +// CHECK-DAG: #[[MOD_SP:.*]] = #llvm.di_subprogram<{{.*}}name = "mod_sub"{{.*}}retainedNodes = {{.*}}> // CHECK-DAG: #llvm.di_global_variable // CHECK-DAG: #llvm.di_global_variable // CHECK-DAG: #llvm.di_global_variable diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index 344a781c41e95..9d7b8633958cb 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -72,9 +72,6 @@ if (NOT CMAKE_CROSSCOMPILING) set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_builtins.mod) else() set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_builtins.mod) - if(NOT ${filename} STREQUAL "__fortran_type_info") - set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_type_info.mod) - endif() if(${filename} STREQUAL "iso_fortran_env") set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/iso_fortran_env_impl.mod) endif() @@ -83,6 +80,9 @@ if (NOT CMAKE_CROSSCOMPILING) set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_ieee_exceptions.mod) endif() endif() + if(NOT ${filename} STREQUAL "__fortran_type_info" AND NOT ${filename} STREQUAL "__fortran_builtins") + set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_type_info.mod) + endif() # The module contains PPC vector types that needs the PPC target. 
if(${filename} STREQUAL "__ppc_intrinsics" OR diff --git a/libcxx/.clang-format b/libcxx/.clang-format index b2ca452931fec..c37b234e857de 100644 --- a/libcxx/.clang-format +++ b/libcxx/.clang-format @@ -44,7 +44,6 @@ AttributeMacros: [ '_LIBCPP_NO_UNIQUE_ADDRESS', '_LIBCPP_NOALIAS', '_LIBCPP_NODISCARD', - '_LIBCPP_NORETURN', '_LIBCPP_OVERRIDABLE_FUNC_VIS', '_LIBCPP_STANDALONE_DEBUG', '_LIBCPP_TEMPLATE_DATA_VIS', diff --git a/libcxx/include/__chrono/exception.h b/libcxx/include/__chrono/exception.h index 266f8fac44176..cc408d78a36da 100644 --- a/libcxx/include/__chrono/exception.h +++ b/libcxx/include/__chrono/exception.h @@ -71,7 +71,7 @@ class nonexistent_local_time : public runtime_error { }; template -_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_nonexistent_local_time( +[[noreturn]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_nonexistent_local_time( [[maybe_unused]] const local_time<_Duration>& __time, [[maybe_unused]] const local_info& __info) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw nonexistent_local_time(__time, __info); @@ -115,7 +115,7 @@ class ambiguous_local_time : public runtime_error { }; template -_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_ambiguous_local_time( +[[noreturn]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_ambiguous_local_time( [[maybe_unused]] const local_time<_Duration>& __time, [[maybe_unused]] const local_info& __info) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw ambiguous_local_time(__time, __info); diff --git a/libcxx/include/__config b/libcxx/include/__config index bccf90d1dbacd..b0a5dda147a6a 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -312,7 +312,6 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ALIGNOF(_Tp) alignof(_Tp) # define _ALIGNAS_TYPE(x) alignas(x) # define _ALIGNAS(x) alignas(x) -# define _LIBCPP_NORETURN [[noreturn]] # define _NOEXCEPT noexcept # define _NOEXCEPT_(...) noexcept(__VA_ARGS__) # define _LIBCPP_CONSTEXPR constexpr @@ -322,7 +321,6 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ALIGNOF(_Tp) _Alignof(_Tp) # define _ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCPP_ALIGNOF(x)))) # define _ALIGNAS(x) __attribute__((__aligned__(x))) -# define _LIBCPP_NORETURN __attribute__((__noreturn__)) # define _LIBCPP_HAS_NO_NOEXCEPT # define nullptr __nullptr # define _NOEXCEPT throw() diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index beadd9212abd1..9e5351f534a1c 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -159,7 +159,7 @@ _LIBCPP_EXPORTED_FROM_ABI void swap(exception_ptr&, exception_ptr&) _NOEXCEPT; _LIBCPP_EXPORTED_FROM_ABI exception_ptr __copy_exception_ptr(void* __except, const void* __ptr); _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); // This is a built-in template function which automagically extracts the required // information. 
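The libc++ hunks above and below replace the `_LIBCPP_NORETURN` macro, now deleted from `__config`, with the attribute spelled out at each declaration. Two spellings appear in the patch: the reserved form `[[__noreturn__]]` in headers that must work in every language mode (a plausible motivation being that the plain token could collide with a user-visible `noreturn` macro, such as the one C11's `<stdnoreturn.h>` defines), and plain `[[noreturn]]` in C++17-and-later components such as `<format>`, `<any>`, and `<optional>`. A minimal sketch of the throw-helper shape these declarations annotate (`example_throw_length_error` and `EXAMPLE_HAS_NO_EXCEPTIONS` are stand-ins, not libc++ names):

```cpp
#include <cstdlib>
#include <stdexcept>

// Illustrative only -- not libc++'s actual helper. The __throw_* functions
// in this patch follow this shape: marked noreturn, throwing when
// exceptions are enabled, aborting under -fno-exceptions.
// EXAMPLE_HAS_NO_EXCEPTIONS stands in for _LIBCPP_HAS_NO_EXCEPTIONS.
[[__noreturn__]] inline void example_throw_length_error(const char* what) {
#ifndef EXAMPLE_HAS_NO_EXCEPTIONS
  throw std::length_error(what);
#else
  (void)what;
  std::abort();
#endif
}
```

Because the helper never returns in either configuration, callers can rely on control not continuing past it, which is what makes moving the attribute from a configuration macro onto the declarations themselves effectively a spelling change.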
diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h index 4c7970d167ffa..8e817e1c06978 100644 --- a/libcxx/include/__exception/nested_exception.h +++ b/libcxx/include/__exception/nested_exception.h @@ -40,7 +40,7 @@ class _LIBCPP_EXPORTED_FROM_ABI nested_exception { virtual ~nested_exception() _NOEXCEPT; // access functions - _LIBCPP_NORETURN void rethrow_nested() const; + [[__noreturn__]] void rethrow_nested() const; _LIBCPP_HIDE_FROM_ABI exception_ptr nested_ptr() const _NOEXCEPT { return __ptr_; } }; @@ -55,19 +55,19 @@ struct __throw_with_nested; template struct __throw_with_nested<_Tp, _Up, true> { - _LIBCPP_NORETURN static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { + [[__noreturn__]] static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { throw __nested<_Up>(std::forward<_Tp>(__t)); } }; template struct __throw_with_nested<_Tp, _Up, false> { - _LIBCPP_NORETURN static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { throw std::forward<_Tp>(__t); } + [[__noreturn__]] static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { throw std::forward<_Tp>(__t); } }; #endif template -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void throw_with_nested(_Tp&& __t) { +[[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void throw_with_nested(_Tp&& __t) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS using _Up = __decay_t<_Tp>; static_assert(is_copy_constructible<_Up>::value, "type thrown must be CopyConstructible"); diff --git a/libcxx/include/__exception/operations.h b/libcxx/include/__exception/operations.h index 4a0a697c00e6e..c8744eb297a4e 100644 --- a/libcxx/include/__exception/operations.h +++ b/libcxx/include/__exception/operations.h @@ -22,7 +22,7 @@ namespace std { // purposefully not using versioning namespace using unexpected_handler = void (*)(); _LIBCPP_EXPORTED_FROM_ABI unexpected_handler set_unexpected(unexpected_handler) _NOEXCEPT; _LIBCPP_EXPORTED_FROM_ABI unexpected_handler get_unexpected() _NOEXCEPT; -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void unexpected(); +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void unexpected(); #endif using terminate_handler = void (*)(); @@ -37,7 +37,7 @@ _LIBCPP_EXPORTED_FROM_ABI int uncaught_exceptions() _NOEXCEPT; class _LIBCPP_EXPORTED_FROM_ABI exception_ptr; _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); } // namespace std #endif // _LIBCPP___EXCEPTION_OPERATIONS_H diff --git a/libcxx/include/__exception/terminate.h b/libcxx/include/__exception/terminate.h index e672471dc5263..0bfc3506d3791 100644 --- a/libcxx/include/__exception/terminate.h +++ b/libcxx/include/__exception/terminate.h @@ -16,7 +16,7 @@ #endif namespace std { // purposefully not using versioning namespace -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void terminate() _NOEXCEPT; +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void terminate() _NOEXCEPT; } // namespace std #endif // _LIBCPP___EXCEPTION_TERMINATE_H diff --git a/libcxx/include/__filesystem/filesystem_error.h b/libcxx/include/__filesystem/filesystem_error.h index 80a11e3b1932c..f43568c2004d2 100644 --- a/libcxx/include/__filesystem/filesystem_error.h +++ b/libcxx/include/__filesystem/filesystem_error.h @@ -69,13 +69,13 @@ class _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY _LIBCPP_EXPORTED_FROM_ABI filesyst # ifndef _LIBCPP_HAS_NO_EXCEPTIONS template -_LIBCPP_NORETURN inline 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void __throw_filesystem_error(_Args&&... __args) { throw filesystem_error(std::forward<_Args>(__args)...); } # else template -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void __throw_filesystem_error(_Args&&...) { _LIBCPP_VERBOSE_ABORT("filesystem_error was thrown in -fno-exceptions mode"); } diff --git a/libcxx/include/__format/format_error.h b/libcxx/include/__format/format_error.h index 35a39ee82f3da..1df7dbff2b7df 100644 --- a/libcxx/include/__format/format_error.h +++ b/libcxx/include/__format/format_error.h @@ -35,7 +35,7 @@ class _LIBCPP_EXPORTED_FROM_ABI format_error : public runtime_error { }; _LIBCPP_DIAGNOSTIC_POP -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_format_error(const char* __s) { +[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI void __throw_format_error(const char* __s) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw format_error(__s); # else diff --git a/libcxx/include/__format/parser_std_format_spec.h b/libcxx/include/__format/parser_std_format_spec.h index 28891e5d2876c..6bdf8e319ba44 100644 --- a/libcxx/include/__format/parser_std_format_spec.h +++ b/libcxx/include/__format/parser_std_format_spec.h @@ -52,13 +52,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace __format_spec { -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void +[[noreturn]] _LIBCPP_HIDE_FROM_ABI inline void __throw_invalid_option_format_error(const char* __id, const char* __option) { std::__throw_format_error( (string("The format specifier for ") + __id + " does not allow the " + __option + " option").c_str()); } -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void __throw_invalid_type_format_error(const char* __id) { +[[noreturn]] _LIBCPP_HIDE_FROM_ABI inline void __throw_invalid_type_format_error(const char* __id) { std::__throw_format_error( (string("The type option contains an invalid value for ") + __id + " formatting argument").c_str()); } diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index c7b98035e34bf..ff31011caa329 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -78,7 +78,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_function_call : public exception { }; _LIBCPP_DIAGNOSTIC_POP -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_function_call() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_function_call() { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_function_call(); # else diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index 4dd8022822d22..5dcd475e2c9f9 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -123,7 +123,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_weak_ptr : public std::exception { const char* what() const _NOEXCEPT override; }; -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_weak_ptr() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_weak_ptr() { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_weak_ptr(); #else diff --git a/libcxx/include/__system_error/system_error.h b/libcxx/include/__system_error/system_error.h index 362e67505658c..3ffa1029ca5c2 100644 --- a/libcxx/include/__system_error/system_error.h +++ b/libcxx/include/__system_error/system_error.h @@ 
-39,8 +39,8 @@ class _LIBCPP_EXPORTED_FROM_ABI system_error : public runtime_error { _LIBCPP_HIDE_FROM_ABI const error_code& code() const _NOEXCEPT { return __ec_; } }; -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_system_error(int __ev, const char* __what_arg); -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void __throw_system_error(error_code __ec, const char* __what_arg) { +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_system_error(int __ev, const char* __what_arg); +[[__noreturn__]] _LIBCPP_HIDE_FROM_ABI inline void __throw_system_error(error_code __ec, const char* __what_arg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw system_error(__ec, __what_arg); #else diff --git a/libcxx/include/__utility/unreachable.h b/libcxx/include/__utility/unreachable.h index d833f74c2e4f1..5525452aa55ef 100644 --- a/libcxx/include/__utility/unreachable.h +++ b/libcxx/include/__utility/unreachable.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void __libcpp_unreachable() { +[[__noreturn__]] _LIBCPP_HIDE_FROM_ABI inline void __libcpp_unreachable() { _LIBCPP_ASSERT_INTERNAL(false, "std::unreachable() was reached"); __builtin_unreachable(); } diff --git a/libcxx/include/__verbose_abort b/libcxx/include/__verbose_abort index 195ce65b721ff..244278aec652d 100644 --- a/libcxx/include/__verbose_abort +++ b/libcxx/include/__verbose_abort @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // This function should never be called directly from the code -- it should only be called through // the _LIBCPP_VERBOSE_ABORT macro. -_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS +[[__noreturn__]] _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...); // _LIBCPP_VERBOSE_ABORT(format, args...) 
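__libcpp_unreachable above combines a debug-mode assertion with __builtin_unreachable(), so misuse traps in assertion-enabled builds while release builds let the optimizer delete any path that reaches it. A hedged sketch of the same pattern outside libc++:

    #include <cassert>

    [[noreturn]] inline void demo_unreachable() {
      assert(false && "demo_unreachable() was reached"); // fires in debug builds
      __builtin_unreachable(); // UB if executed; lets the optimizer drop the path
    }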
diff --git a/libcxx/include/any b/libcxx/include/any
index 7630e8a057d05..6e4ff31ff9b62 100644
--- a/libcxx/include/any
+++ b/libcxx/include/any
@@ -127,7 +127,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD

 #if _LIBCPP_STD_VER >= 17

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST void __throw_bad_any_cast() {
+[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST void __throw_bad_any_cast() {
 #  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw bad_any_cast();
 #  else
diff --git a/libcxx/include/future b/libcxx/include/future
index 01c0b10172cd3..9a0eb7971a313 100644
--- a/libcxx/include/future
+++ b/libcxx/include/future
@@ -472,7 +472,7 @@ inline _LIBCPP_HIDE_FROM_ABI error_condition make_error_condition(future_errc __
   return error_condition(static_cast<int>(__e), future_category());
 }

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_future_error(future_errc __ev);
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_future_error(future_errc __ev);

 class _LIBCPP_EXPORTED_FROM_ABI future_error : public logic_error {
   error_code __ec_;
diff --git a/libcxx/include/ios b/libcxx/include/ios
index 426838b91e5dc..61a05fadd29a1 100644
--- a/libcxx/include/ios
+++ b/libcxx/include/ios
@@ -440,7 +440,7 @@ public:
   ~failure() _NOEXCEPT override;
 };

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_failure(char const* __msg) {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_failure(char const* __msg) {
 #  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw ios_base::failure(__msg);
 #  else
diff --git a/libcxx/include/new b/libcxx/include/new
index 9015c4e712763..207e4b46e0ca6 100644
--- a/libcxx/include/new
+++ b/libcxx/include/new
@@ -166,9 +166,9 @@ public:
 };
 #endif // defined(_LIBCPP_ABI_VCRUNTIME) && defined(_HAS_EXCEPTIONS) && _HAS_EXCEPTIONS == 0

-_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_bad_alloc(); // not in C++ spec
+[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_bad_alloc(); // not in C++ spec

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_array_new_length() {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_array_new_length() {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw bad_array_new_length();
 #else
diff --git a/libcxx/include/optional b/libcxx/include/optional
index 41d7515a2b689..b0933b59b25d2 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -255,7 +255,7 @@ public:

 _LIBCPP_BEGIN_NAMESPACE_STD

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS void
+[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS void
 __throw_bad_optional_access() {
 #  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw bad_optional_access();
diff --git a/libcxx/include/regex b/libcxx/include/regex
index 08aebc2266f5d..d59abb8daf8ec 100644
--- a/libcxx/include/regex
+++ b/libcxx/include/regex
@@ -983,7 +983,7 @@ public:
 };

 template <regex_constants::error_type _Ev>
-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_regex_error() {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_regex_error() {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw regex_error(_Ev);
 #else
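In the regex change above, the error code is a non-type template parameter of __throw_regex_error, so each code gets its own zero-argument [[noreturn]] helper and nothing is passed at runtime. Roughly, with illustrative names only:

    #include <regex>

    template <std::regex_constants::error_type Ev>
    [[noreturn]] void throw_demo_regex_error() {
      throw std::regex_error(Ev); // the code is baked into each instantiation
    }

    // usage: throw_demo_regex_error<std::regex_constants::error_paren>();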
diff --git a/libcxx/include/stdexcept b/libcxx/include/stdexcept
index 853c185187c77..bdfc27aeac374 100644
--- a/libcxx/include/stdexcept
+++ b/libcxx/include/stdexcept
@@ -209,9 +209,9 @@ public:
 _LIBCPP_BEGIN_NAMESPACE_STD

 // in the dylib
-_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_runtime_error(const char*);
+[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_runtime_error(const char*);

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_logic_error(const char* __msg) {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_logic_error(const char* __msg) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw logic_error(__msg);
 #else
@@ -219,7 +219,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_logic_error(const cha
 #endif
 }

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_domain_error(const char* __msg) {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_domain_error(const char* __msg) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw domain_error(__msg);
 #else
@@ -227,7 +227,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_domain_error(const ch
 #endif
 }

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_invalid_argument(const char* __msg) {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_invalid_argument(const char* __msg) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw invalid_argument(__msg);
 #else
@@ -235,7 +235,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_invalid_argument(cons
 #endif
 }

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_length_error(const char* __msg) {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_length_error(const char* __msg) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw length_error(__msg);
 #else
@@ -243,7 +243,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_length_error(const ch
 #endif
 }

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range(const char* __msg) {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range(const char* __msg) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw out_of_range(__msg);
 #else
@@ -251,7 +251,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range(const ch
 #endif
 }

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_range_error(const char* __msg) {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_range_error(const char* __msg) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw range_error(__msg);
 #else
@@ -259,7 +259,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_range_error(const cha
 #endif
 }

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_overflow_error(const char* __msg) {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_overflow_error(const char* __msg) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw overflow_error(__msg);
 #else
@@ -267,7 +267,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_overflow_error(const
 #endif
 }

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_underflow_error(const char* __msg) {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_underflow_error(const char* __msg) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw underflow_error(__msg);
 #else
diff --git a/libcxx/include/string b/libcxx/include/string
index aba79a74912f5..46c5a5ac6de60 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -749,14 +749,6 @@ struct __can_be_converted_to_string_view
 struct __uninitialized_size_tag {};
 struct __init_with_sentinel_tag {};

-template <size_t _PaddingSize>
-struct __padding {
-  char __padding_[_PaddingSize];
-};
-
-template <>
-struct __padding<0> {};
-
 template <class _CharT, class _Traits, class _Allocator>
 class basic_string {
 private:
@@ -861,7 +853,7 @@ private:
   struct __short {
     value_type __data_[__min_cap];
-    _LIBCPP_NO_UNIQUE_ADDRESS __padding<sizeof(value_type) - 1> __padding_;
+    unsigned char __padding_[sizeof(value_type) - 1];
     unsigned char __size_ : 7;
     unsigned char __is_long_ : 1;
   };
@@ -913,7 +905,7 @@ private:
       unsigned char __is_long_ : 1;
       unsigned char __size_ : 7;
     };
-    _LIBCPP_NO_UNIQUE_ADDRESS __padding<sizeof(value_type) - 1> __padding_;
+    char __padding_[sizeof(value_type) - 1];
     value_type __data_[__min_cap];
   };

@@ -2238,11 +2230,11 @@ private:
     return std::__is_pointer_in_range(data(), data() + size() + 1, std::addressof(__v));
   }

-  _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const {
+  [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const {
     std::__throw_length_error("basic_string");
   }

-  _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const {
+  [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const {
     std::__throw_out_of_range("basic_string");
   }

diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo
index 54e0b4cf5d634..a44fa4d73ee58 100644
--- a/libcxx/include/typeinfo
+++ b/libcxx/include/typeinfo
@@ -373,7 +373,7 @@ private:
 #endif // defined(_LIBCPP_ABI_VCRUNTIME) && _HAS_EXCEPTIONS == 0

 _LIBCPP_BEGIN_NAMESPACE_STD
-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_cast() {
+[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_cast() {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw bad_cast();
 #else
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 1367cd66f3701..1cac603c27c24 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -298,7 +298,7 @@ struct __farray {
   _LIBCPP_HIDE_FROM_ABI constexpr const _Tp& operator[](size_t __n) const noexcept { return __buf_[__n]; }
 };

-_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS void
+[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS void
 __throw_bad_variant_access() {
 #  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw bad_variant_access();
diff --git a/libcxx/include/vector b/libcxx/include/vector
index 2442852c764a6..fc0a48669fe53 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -995,9 +995,9 @@ private:
     __move_assign_alloc(__c, integral_constant<bool, __alloc_traits::propagate_on_container_move_assignment::value>());
   }

-  _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); }
+  [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); }

-  _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); }
+  [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); }

   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector& __c, true_type) {
     if (__alloc() != __c.__alloc()) {
@@ -2163,9 +2163,9 @@ public:
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __invariants() const;

 private:
-  _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); }
+  [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); }

-  _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); }
+  [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); }

   template <class _InputIterator, class _Sentinel>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
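The basic_string hunks above drop the __padding<N> helper (previously paired with [[no_unique_address]]) in favor of a plain byte array of sizeof(value_type) - 1 elements; for char strings that is a zero-length array, a Clang/GCC extension. A toy model of why the short-string layout needs that padding, with assumed types and typical non-Windows ABI sizes, not libc++'s actual representation:

    #include <cstddef>

    template <class CharT, std::size_t MinCap>
    struct ShortRep {
      CharT data[MinCap];
      unsigned char padding[sizeof(CharT) - 1]; // zero-length when CharT is char
      unsigned char size : 7;                   // shares the last byte with the flag
      unsigned char is_long : 1;
    };

    // With a 4-byte wchar_t, 3 bytes of padding put the size/flag byte last,
    // keeping sizeof a multiple of sizeof(CharT). (ABI-dependent; a sketch only.)
    static_assert(sizeof(ShortRep<wchar_t, 4>) == 5 * sizeof(wchar_t),
                  "size/flag byte occupies the final byte of the representation");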
diff --git a/libcxx/src/stdexcept.cpp b/libcxx/src/stdexcept.cpp
index bc25c0f9e6ef6..134d28efb750f 100644
--- a/libcxx/src/stdexcept.cpp
+++ b/libcxx/src/stdexcept.cpp
@@ -19,7 +19,7 @@

 _LIBCPP_BEGIN_NAMESPACE_STD

-_LIBCPP_NORETURN void __throw_runtime_error(const char* msg) {
+void __throw_runtime_error(const char* msg) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   throw runtime_error(msg);
 #else
diff --git a/libcxx/src/string.cpp b/libcxx/src/string.cpp
index cf07b3ef1ef27..12db5381a7b1b 100644
--- a/libcxx/src/string.cpp
+++ b/libcxx/src/string.cpp
@@ -28,8 +28,8 @@ struct __basic_string_common;
 // The struct isn't declared anymore in the headers. It's only here for ABI compatibility.
 template <>
 struct __basic_string_common<true> {
-  _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const;
-  _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const;
+  [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const;
+  [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const;
 };

 void __basic_string_common<true>::__throw_length_error() const { std::__throw_length_error("basic_string"); }
diff --git a/libcxx/src/support/runtime/exception_fallback.ipp b/libcxx/src/support/runtime/exception_fallback.ipp
index 18ff4b83d8765..ca542c9497214 100644
--- a/libcxx/src/support/runtime/exception_fallback.ipp
+++ b/libcxx/src/support/runtime/exception_fallback.ipp
@@ -21,7 +21,7 @@ unexpected_handler set_unexpected(unexpected_handler func) noexcept {

 unexpected_handler get_unexpected() noexcept { return __libcpp_atomic_load(&__unexpected_handler); }

-_LIBCPP_NORETURN void unexpected() {
+[[noreturn]] void unexpected() {
   (*get_unexpected())();
   // unexpected handler should not return
   terminate();
@@ -33,7 +33,7 @@ terminate_handler set_terminate(terminate_handler func) noexcept {

 terminate_handler get_terminate() noexcept { return __libcpp_atomic_load(&__terminate_handler); }

-_LIBCPP_NORETURN void terminate() noexcept {
+[[noreturn]] void terminate() noexcept {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
diff --git a/libcxx/src/support/runtime/exception_msvc.ipp b/libcxx/src/support/runtime/exception_msvc.ipp
index 323cd9d180057..163aec057d9b5 100644
--- a/libcxx/src/support/runtime/exception_msvc.ipp
+++ b/libcxx/src/support/runtime/exception_msvc.ipp
@@ -32,7 +32,7 @@ unexpected_handler set_unexpected(unexpected_handler func) noexcept { return ::s

 unexpected_handler get_unexpected() noexcept { return ::_get_unexpected(); }

-_LIBCPP_NORETURN void unexpected() {
+[[noreturn]] void unexpected() {
   (*get_unexpected())();
   // unexpected handler should not return
   terminate();
@@ -42,7 +42,7 @@ terminate_handler set_terminate(terminate_handler func) noexcept { return ::set_

 terminate_handler get_terminate() noexcept { return ::_get_terminate(); }

-_LIBCPP_NORETURN void terminate() noexcept {
+[[noreturn]] void terminate() noexcept {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
diff --git a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp
index bdb17b9996b7e..8f5c2060bb06c 100644
--- a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp
+++ b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp
@@ -40,7 +40,7 @@ nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {}

 nested_exception::~nested_exception() noexcept {}

-_LIBCPP_NORETURN void nested_exception::rethrow_nested() const {
+void nested_exception::rethrow_nested() const {
   if (__ptr_ == nullptr)
     terminate();
   rethrow_exception(__ptr_);
@@ -55,7 +55,7 @@ exception_ptr current_exception() noexcept {
   return ptr;
 }

-_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) {
+void rethrow_exception(exception_ptr p) {
   __cxa_rethrow_primary_exception(p.__ptr_);
   // if p.__ptr_ is NULL, above returns so we terminate
   terminate();
diff --git a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp
index 6dad248f9e1fd..174b44ce0e6f7 100644
--- a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp
+++ b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp
@@ -31,7 +31,7 @@ struct exception_ptr {

 } // namespace __exception_ptr

-_LIBCPP_NORETURN void rethrow_exception(__exception_ptr::exception_ptr);
+[[noreturn]] void rethrow_exception(__exception_ptr::exception_ptr);

 exception_ptr::~exception_ptr() noexcept { reinterpret_cast<__exception_ptr::exception_ptr*>(this)->~exception_ptr(); }

@@ -55,13 +55,13 @@ exception_ptr exception_ptr::__from_native_exception_pointer(void* __e) noexcept

 nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {}

-_LIBCPP_NORETURN void nested_exception::rethrow_nested() const {
+[[noreturn]] void nested_exception::rethrow_nested() const {
   if (__ptr_ == nullptr)
     terminate();
   rethrow_exception(__ptr_);
 }

-_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) {
+[[noreturn]] void rethrow_exception(exception_ptr p) {
   rethrow_exception(reinterpret_cast<__exception_ptr::exception_ptr&>(p));
 }
diff --git a/libcxx/src/support/runtime/exception_pointer_msvc.ipp b/libcxx/src/support/runtime/exception_pointer_msvc.ipp
index b87742b32ded6..2be5136176e32 100644
--- a/libcxx/src/support/runtime/exception_pointer_msvc.ipp
+++ b/libcxx/src/support/runtime/exception_pointer_msvc.ipp
@@ -61,13 +61,13 @@ exception_ptr current_exception() noexcept {
   return __ret;
 }

-_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) { __ExceptionPtrRethrow(&p); }
+[[noreturn]] void rethrow_exception(exception_ptr p) { __ExceptionPtrRethrow(&p); }

 nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {}

 nested_exception::~nested_exception() noexcept {}

-_LIBCPP_NORETURN void nested_exception::rethrow_nested() const {
+[[noreturn]] void nested_exception::rethrow_nested() const {
   if (__ptr_ == nullptr)
     terminate();
   rethrow_exception(__ptr_);
diff --git a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp
index e12b0caf419d2..1fe3127f18b0b 100644
--- a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp
+++ b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp
@@ -44,7 +44,7 @@ nested_exception::~nested_exception() noexcept {}

 #endif

-_LIBCPP_NORETURN void nested_exception::rethrow_nested() const {
+[[noreturn]] void nested_exception::rethrow_nested() const {
 #warning exception_ptr not yet implemented
   fprintf(stderr, "exception_ptr not yet implemented\n");
   ::abort();
@@ -61,7 +61,7 @@ exception_ptr current_exception() noexcept {
   ::abort();
 }

-_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) {
+[[noreturn]] void rethrow_exception(exception_ptr p) {
 #warning exception_ptr not yet implemented
   fprintf(stderr, "exception_ptr not yet implemented\n");
   ::abort();
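Note the asymmetry in the .ipp files above: exception_fallback.ipp and exception_pointer_glibcxx.ipp keep [[noreturn]] on their definitions, while exception_pointer_cxxabi.ipp drops it. Both are valid, because the attribute from the first declaration applies to the function, and repeating it on a definition is optional. A small illustration with a hypothetical function:

    // Header: the first declaration carries the attribute.
    [[noreturn]] void fatal(const char* msg);

    // Source file: repeating the attribute is optional; the declaration's
    // [[noreturn]] still applies to this definition.
    void fatal(const char* msg) {
      (void)msg;
      __builtin_trap();
    }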
diff --git a/libcxx/src/vector.cpp b/libcxx/src/vector.cpp
index b6153b0e9bf99..3f3a906d6421f 100644
--- a/libcxx/src/vector.cpp
+++ b/libcxx/src/vector.cpp
@@ -17,8 +17,8 @@ struct __vector_base_common;

 template <>
 struct __vector_base_common<true> {
-  _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const;
-  _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const;
+  [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const;
+  [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const;
 };

 void __vector_base_common<true>::__throw_length_error() const { std::__throw_length_error("vector"); }
diff --git a/libcxx/test/support/assert_macros.h b/libcxx/test/support/assert_macros.h
index 1059823dcb246..b7011794025bf 100644
--- a/libcxx/test/support/assert_macros.h
+++ b/libcxx/test/support/assert_macros.h
@@ -50,7 +50,7 @@ void test_log(const char* condition, const char* file, int line, const F& functo
 }

 template <class Arg>
-TEST_NORETURN void test_fail(const char* file, int line, const Arg& arg) {
+[[noreturn]] void test_fail(const char* file, int line, const Arg& arg) {
   test_log("", file, line, arg);
   std::abort();
 }
diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h
index 329ce819a6c8d..47ebfeeeefc0f 100644
--- a/libcxx/test/support/check_assertion.h
+++ b/libcxx/test/support/check_assertion.h
@@ -142,7 +142,7 @@ std::string ToString(std::array<DeathCause, N> const& causes) {
   return ss.str();
 }

-TEST_NORETURN void StopChildProcess(DeathCause cause) { std::exit(static_cast<int>(cause)); }
+[[noreturn]] void StopChildProcess(DeathCause cause) { std::exit(static_cast<int>(cause)); }

 DeathCause ConvertToDeathCause(int val) {
   if (val < static_cast<int>(DeathCause::VerboseAbort) || val > static_cast<int>(DeathCause::Unknown)) {
@@ -260,7 +260,7 @@ class DeathTest {
   }

   template <class Func>
-  TEST_NORETURN void RunForChild(Func&& f) {
+  [[noreturn]] void RunForChild(Func&& f) {
     close(GetStdOutReadFD()); // don't need to read from the pipe in the child.
     close(GetStdErrReadFD());
     auto DupFD = [](int DestFD, int TargetFD) {
diff --git a/libcxx/test/support/count_new.h b/libcxx/test/support/count_new.h
index 61c8ca16ab0d0..c8169d3acceab 100644
--- a/libcxx/test/support/count_new.h
+++ b/libcxx/test/support/count_new.h
@@ -24,14 +24,13 @@

 namespace detail {
-  TEST_NORETURN
-  inline void throw_bad_alloc_helper() {
+[[noreturn]] inline void throw_bad_alloc_helper() {
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    throw std::bad_alloc();
+  throw std::bad_alloc();
 #else
   std::abort();
 #endif
-  }
+}
 }

 class MemCounter
diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h
index 5d4c1a65cfafb..3aa818af1d269 100644
--- a/libcxx/test/support/test_macros.h
+++ b/libcxx/test/support/test_macros.h
@@ -214,12 +214,6 @@
 #define TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT
 #endif

-#if defined(_LIBCPP_NORETURN)
-#define TEST_NORETURN _LIBCPP_NORETURN
-#else
-#define TEST_NORETURN [[noreturn]]
-#endif
-
 #if defined(_LIBCPP_HAS_NO_ALIGNED_ALLOCATION) || \
     (!(TEST_STD_VER > 14 || \
       (defined(__cpp_aligned_new) && __cpp_aligned_new >= 201606L)))
diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp
index 060eb6c32004d..0f33885f7df37 100644
--- a/lld/COFF/Chunks.cpp
+++ b/lld/COFF/Chunks.cpp
@@ -1093,4 +1093,26 @@ void CHPERedirectionChunk::writeTo(uint8_t *buf) const {
   }
 }

+ImportThunkChunkARM64EC::ImportThunkChunkARM64EC(ImportFile *file)
+    : ImportThunkChunk(file->ctx, file->impSym), file(file) {}
+
+void ImportThunkChunkARM64EC::writeTo(uint8_t *buf) const {
+  memcpy(buf, importThunkARM64EC, sizeof(importThunkARM64EC));
+  applyArm64Addr(buf, file->impSym->getRVA(), rva, 12);
+  applyArm64Ldr(buf + 4, file->impSym->getRVA() & 0xfff);
+
+  // The exit thunk may be missing. This can happen if the application only
+  // references a function by its address (in which case the thunk is never
+  // actually used, but is still required to fill the auxiliary IAT), or in
+  // cases of hand-written assembly calling an imported ARM64EC function (where
+  // the exit thunk is ignored by __icall_helper_arm64ec). In such cases, MSVC
+  // link.exe uses 0 as the RVA.
+  uint32_t exitThunkRVA = exitThunk ? exitThunk->getRVA() : 0;
+  applyArm64Addr(buf + 8, exitThunkRVA, rva + 8, 12);
+  applyArm64Imm(buf + 12, exitThunkRVA & 0xfff, 0);
+
+  Defined *helper = cast<Defined>(file->ctx.config.arm64ECIcallHelper);
+  applyArm64Branch26(buf + 16, helper->getRVA() - rva - 16);
+}
+
 } // namespace lld::coff
diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h
index 30e5b538c352e..28e0fd68ac515 100644
--- a/lld/COFF/Chunks.h
+++ b/lld/COFF/Chunks.h
@@ -544,6 +544,14 @@ static const uint8_t importThunkARM64[] = {
     0x00, 0x02, 0x1f, 0xd6, // br   x16
 };

+static const uint32_t importThunkARM64EC[] = {
+    0x9000000b, // adrp x11, 0x0
+    0xf940016b, // ldr  x11, [x11]
+    0x9000000a, // adrp x10, 0x0
+    0x9100014a, // add  x10, x10, #0x0
+    0x14000000  // b    0x0
+};
+
 // Windows-specific.
 // A chunk for DLL import jump table entry. In a final output, its
 // contents will be a JMP instruction to some __imp_ symbol.
@@ -599,6 +607,22 @@ class ImportThunkChunkARM64 : public ImportThunkChunk {
   MachineTypes getMachine() const override { return ARM64; }
 };

+// ARM64EC __impchk_* thunk implementation.
+// Performs an indirect call to an imported function pointer
+// using the __icall_helper_arm64ec helper function.
+class ImportThunkChunkARM64EC : public ImportThunkChunk {
+public:
+  explicit ImportThunkChunkARM64EC(ImportFile *file);
+  size_t getSize() const override { return sizeof(importThunkARM64EC); };
+  MachineTypes getMachine() const override { return ARM64EC; }
+  void writeTo(uint8_t *buf) const override;
+
+  Defined *exitThunk;
+
+private:
+  ImportFile *file;
+};
+
 class RangeExtensionThunkARM : public NonSectionCodeChunk {
 public:
   explicit RangeExtensionThunkARM(COFFLinkerContext &ctx, Defined *t)
diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h
index 947f3fead54e0..738776a971ea3 100644
--- a/lld/COFF/Config.h
+++ b/lld/COFF/Config.h
@@ -164,6 +164,7 @@ struct Configuration {
   std::set<std::string> delayLoads;
   std::map<std::string, int> dllOrder;
   Symbol *delayLoadHelper = nullptr;
+  Symbol *arm64ECIcallHelper = nullptr;

   bool saveTemps = false;
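ImportThunkChunkARM64EC::writeTo above patches the adrp/ldr and adrp/add pairs of the thunk template with the import's RVA and the exit thunk's RVA. For reference, a hedged sketch of the ADRP fixup arithmetic; patchAdrp is an illustrative helper, not LLD's applyArm64Addr:

    #include <cstdint>

    // Patch an ADRP's 21-bit page displacement: pages = page(target) - page(pc).
    // immlo occupies bits [30:29] of the instruction, immhi bits [23:5].
    inline uint32_t patchAdrp(uint32_t ins, uint64_t targetRVA, uint64_t pcRVA) {
      int64_t pages = static_cast<int64_t>(targetRVA >> 12) -
                      static_cast<int64_t>(pcRVA >> 12);
      uint32_t immlo = static_cast<uint32_t>(pages) & 0x3;
      uint32_t immhi = (static_cast<uint32_t>(pages) >> 2) & 0x7ffff;
      return ins | (immlo << 29) | (immhi << 5);
    }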
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 3ef9fa3f65c6a..a1fe6444991a3 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -1383,6 +1383,11 @@ void LinkerDriver::createECExportThunks() {
   }
 }

+void LinkerDriver::pullArm64ECIcallHelper() {
+  if (!ctx.config.arm64ECIcallHelper)
+    ctx.config.arm64ECIcallHelper = addUndefined("__icall_helper_arm64ec");
+}
+
 // In MinGW, if no symbols are chosen to be exported, then all symbols are
 // automatically exported by default. This behavior can be forced by the
 // -export-all-symbols option, so that it happens even when exports are
@@ -2685,7 +2690,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   if (auto *arg = args.getLastArg(OPT_print_symbol_order))
     config->printSymbolOrder = arg->getValue();

-  ctx.symtab.initializeEntryThunks();
+  ctx.symtab.initializeECThunks();

   // Identify unreferenced COMDAT sections.
   if (config->doGC) {
diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h
index b5cf8e2f18fd4..0c195a7cc3148 100644
--- a/lld/COFF/Driver.h
+++ b/lld/COFF/Driver.h
@@ -101,6 +101,8 @@ class LinkerDriver {

   std::unique_ptr<llvm::TarWriter> tar; // for /linkrepro

+  void pullArm64ECIcallHelper();
+
 private:
   // Searches a file from search paths.
   std::optional<StringRef> findFileIfNew(StringRef filename);
diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index fa2d230075d9d..3dbdf8fe3920d 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -190,6 +190,8 @@ void ObjFile::initializeECThunks() {
       ctx.symtab.addEntryThunk(getSymbol(entry->src), getSymbol(entry->dst));
       break;
     case Arm64ECThunkType::Exit:
+      ctx.symtab.addExitThunk(getSymbol(entry->src), getSymbol(entry->dst));
+      break;
     case Arm64ECThunkType::GuestExit:
       break;
     default:
@@ -1009,6 +1011,20 @@ MachineTypes ImportFile::getMachineType() const {
   return MachineTypes(machine);
 }

+ImportThunkChunk *ImportFile::makeImportThunk() {
+  switch (hdr->Machine) {
+  case AMD64:
+    return make<ImportThunkChunkX64>(ctx, impSym);
+  case I386:
+    return make<ImportThunkChunkX86>(ctx, impSym);
+  case ARM64:
+    return make<ImportThunkChunkARM64>(ctx, impSym);
+  case ARMNT:
+    return make<ImportThunkChunkARM>(ctx, impSym);
+  }
+  llvm_unreachable("unknown machine type");
+}
+
 void ImportFile::parse() {
   const auto *hdr = reinterpret_cast<const coff_import_header *>(mb.getBufferStart());

@@ -1069,10 +1085,16 @@ void ImportFile::parse() {
   // DLL functions just like regular non-DLL functions.)
   if (hdr->getType() == llvm::COFF::IMPORT_CODE) {
     if (ctx.config.machine != ARM64EC) {
-      thunkSym = ctx.symtab.addImportThunk(name, impSym, hdr->Machine);
+      thunkSym = ctx.symtab.addImportThunk(name, impSym, makeImportThunk());
     } else {
-      thunkSym = ctx.symtab.addImportThunk(name, impSym, AMD64);
+      thunkSym = ctx.symtab.addImportThunk(
+          name, impSym, make<ImportThunkChunkX64>(ctx, impSym));
       // FIXME: Add aux IAT symbols.
+
+      StringRef impChkName = saver().save("__impchk_" + name);
+      impchkThunk = make<ImportThunkChunkARM64EC>(this);
+      ctx.symtab.addImportThunk(impChkName, impSym, impchkThunk);
+      ctx.driver.pullArm64ECIcallHelper();
     }
   }
 }
diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h
index 8b3303a8d87f4..3b837017e1c21 100644
--- a/lld/COFF/InputFiles.h
+++ b/lld/COFF/InputFiles.h
@@ -55,6 +55,8 @@ class Defined;
 class DefinedImportData;
 class DefinedImportThunk;
 class DefinedRegular;
+class ImportThunkChunk;
+class ImportThunkChunkARM64EC;
 class SectionChunk;
 class Symbol;
 class Undefined;
@@ -348,10 +350,12 @@ class ImportFile : public InputFile {

   DefinedImportData *impSym = nullptr;
   Symbol *thunkSym = nullptr;
+  ImportThunkChunkARM64EC *impchkThunk = nullptr;
   std::string dllName;

 private:
   void parse() override;
+  ImportThunkChunk *makeImportThunk();

 public:
   StringRef externalName;
diff --git a/lld/COFF/MarkLive.cpp b/lld/COFF/MarkLive.cpp
index 06079a98f2d00..8af58780e1358 100644
--- a/lld/COFF/MarkLive.cpp
+++ b/lld/COFF/MarkLive.cpp
@@ -43,13 +43,23 @@ void markLive(COFFLinkerContext &ctx) {
       worklist.push_back(c);
   };

-  auto addSym = [&](Symbol *b) {
-    if (auto *sym = dyn_cast<DefinedRegular>(b))
+  std::function<void(Symbol *)> addSym;
+
+  auto addImportFile = [&](ImportFile *file) {
+    file->live = true;
+    if (file->impchkThunk && file->impchkThunk->exitThunk)
+      addSym(file->impchkThunk->exitThunk);
+  };
+
+  addSym = [&](Symbol *b) {
+    if (auto *sym = dyn_cast<DefinedRegular>(b)) {
       enqueue(sym->getChunk());
-    else if (auto *sym = dyn_cast<DefinedImportData>(b))
-      sym->file->live = true;
-    else if (auto *sym = dyn_cast<DefinedImportThunk>(b))
-      sym->wrappedSym->file->live = sym->wrappedSym->file->thunkLive = true;
+    } else if (auto *sym = dyn_cast<DefinedImportData>(b)) {
+      addImportFile(sym->file);
+    } else if (auto *sym = dyn_cast<DefinedImportThunk>(b)) {
+      addImportFile(sym->wrappedSym->file);
+      sym->wrappedSym->file->thunkLive = true;
+    }
   };

   // Add GC root chunks.
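The MarkLive.cpp hunk above changes addSym from auto to std::function because addSym and addImportFile now call each other, and a lambda cannot refer to itself while it is being defined: the callable must be declared first and assigned afterwards. The shape of the pattern, with toy types:

    #include <functional>

    void demo() {
      std::function<void(int)> addSym; // declared first so helpers can capture it
      auto addImportFile = [&](int file) {
        if (file > 0)
          addSym(file - 1); // calls back into addSym through the captured reference
      };
      addSym = [&](int sym) { addImportFile(sym); };
      addSym(3); // addSym -> addImportFile -> addSym -> ... -> stops at 0
    }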
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index bb7583bb9a7df..a6575ecac3bb4 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -557,7 +557,11 @@ void SymbolTable::addEntryThunk(Symbol *from, Symbol *to) {
   entryThunks.push_back({from, to});
 }

-void SymbolTable::initializeEntryThunks() {
+void SymbolTable::addExitThunk(Symbol *from, Symbol *to) {
+  exitThunks[from] = to;
+}
+
+void SymbolTable::initializeECThunks() {
   for (auto it : entryThunks) {
     auto *to = dyn_cast<Defined>(it.second);
     if (!to)
@@ -573,6 +577,16 @@
     }
     from->getChunk()->setEntryThunk(to);
   }
+
+  for (ImportFile *file : ctx.importFileInstances) {
+    if (!file->impchkThunk)
+      continue;
+
+    Symbol *sym = exitThunks.lookup(file->thunkSym);
+    if (!sym)
+      sym = exitThunks.lookup(file->impSym);
+    file->impchkThunk->exitThunk = dyn_cast_or_null<Defined>(sym);
+  }
 }

 Symbol *SymbolTable::addUndefined(StringRef name, InputFile *f,
@@ -784,11 +798,11 @@ DefinedImportData *SymbolTable::addImportData(StringRef n, ImportFile *f) {
 }

 Symbol *SymbolTable::addImportThunk(StringRef name, DefinedImportData *id,
-                                    uint16_t machine) {
+                                    ImportThunkChunk *chunk) {
   auto [s, wasInserted] = insert(name, nullptr);
   s->isUsedInRegularObj = true;
   if (wasInserted || isa<Undefined>(s) || s->isLazy()) {
-    replaceSymbol<DefinedImportThunk>(s, ctx, name, id, machine);
+    replaceSymbol<DefinedImportThunk>(s, ctx, name, id, chunk);
     return s;
   }

diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h
index 51c6c79ec1446..13e151e3a8c50 100644
--- a/lld/COFF/SymbolTable.h
+++ b/lld/COFF/SymbolTable.h
@@ -28,6 +28,7 @@ class COFFLinkerContext;
 class Defined;
 class DefinedAbsolute;
 class DefinedRegular;
+class ImportThunkChunk;
 class LazyArchive;
 class SectionChunk;
 class Symbol;
@@ -104,10 +105,11 @@ class SymbolTable {
                              CommonChunk *c = nullptr);
   DefinedImportData *addImportData(StringRef n, ImportFile *f);
   Symbol *addImportThunk(StringRef name, DefinedImportData *s,
-                         uint16_t machine);
+                         ImportThunkChunk *chunk);
   void addLibcall(StringRef name);
   void addEntryThunk(Symbol *from, Symbol *to);
-  void initializeEntryThunks();
+  void addExitThunk(Symbol *from, Symbol *to);
+  void initializeECThunks();

   void reportDuplicate(Symbol *existing, InputFile *newFile,
                        SectionChunk *newSc = nullptr,
@@ -140,6 +142,7 @@
   std::unique_ptr<BitcodeCompiler> lto;
   bool ltoCompilationDone = false;
   std::vector<std::pair<Symbol *, Symbol *>> entryThunks;
+  llvm::DenseMap<Symbol *, Symbol *> exitThunks;

   COFFLinkerContext &ctx;
 };
diff --git a/lld/COFF/Symbols.cpp b/lld/COFF/Symbols.cpp
index b098abb80d6f1..5f4d797f74a2d 100644
--- a/lld/COFF/Symbols.cpp
+++ b/lld/COFF/Symbols.cpp
@@ -107,22 +107,10 @@ COFFSymbolRef DefinedCOFF::getCOFFSymbol() {

 uint64_t DefinedAbsolute::getRVA() { return va - ctx.config.imageBase; }

-static Chunk *makeImportThunk(COFFLinkerContext &ctx, DefinedImportData *s,
-                              uint16_t machine) {
-  if (machine == AMD64)
-    return make<ImportThunkChunkX64>(ctx, s);
-  if (machine == I386)
-    return make<ImportThunkChunkX86>(ctx, s);
-  if (machine == ARM64)
-    return make<ImportThunkChunkARM64>(ctx, s);
-  assert(machine == ARMNT);
-  return make<ImportThunkChunkARM>(ctx, s);
-}
-
 DefinedImportThunk::DefinedImportThunk(COFFLinkerContext &ctx, StringRef name,
-                                       DefinedImportData *s, uint16_t machine)
-    : Defined(DefinedImportThunkKind, name), wrappedSym(s),
-      data(makeImportThunk(ctx, s, machine)) {}
+                                       DefinedImportData *s,
+                                       ImportThunkChunk *chunk)
+    : Defined(DefinedImportThunkKind, name), wrappedSym(s), data(chunk) {}

 Defined *Undefined::getWeakAlias() {
   // A weak alias may be a weak alias to another symbol, so check recursively.
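initializeECThunks above resolves each import's exit thunk by looking up the thunk symbol first and falling back to the __imp_ pointer symbol, leaving it null (written out as RVA 0, matching MSVC link.exe) when neither was registered. A toy version of that lookup order using standard containers:

    #include <map>
    #include <string>

    const std::string* lookupExitThunk(const std::map<std::string, std::string>& exitThunks,
                                       const std::string& thunkSym,
                                       const std::string& impSym) {
      if (auto it = exitThunks.find(thunkSym); it != exitThunks.end())
        return &it->second; // a thunk registered against the thunk symbol wins
      if (auto it = exitThunks.find(impSym); it != exitThunks.end())
        return &it->second; // otherwise fall back to the __imp_ pointer symbol
      return nullptr;       // no exit thunk: the thunk's RVA field stays 0
    }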
diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h
index c427a062dc82b..724330e4bab95 100644
--- a/lld/COFF/Symbols.h
+++ b/lld/COFF/Symbols.h
@@ -388,7 +388,7 @@ class DefinedImportData : public Defined {
 class DefinedImportThunk : public Defined {
 public:
   DefinedImportThunk(COFFLinkerContext &ctx, StringRef name,
-                     DefinedImportData *s, uint16_t machine);
+                     DefinedImportData *s, ImportThunkChunk *chunk);

   static bool classof(const Symbol *s) {
     return s->kind() == DefinedImportThunkKind;
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 3cb9b3b512ead..b589a16bca32a 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -1248,6 +1248,8 @@ void Writer::appendImportThunks() {
     DefinedImportThunk *thunk = cast<DefinedImportThunk>(file->thunkSym);
     if (file->thunkLive)
       textSec->addChunk(thunk->getChunk());
+    if (file->impchkThunk)
+      textSec->addChunk(file->impchkThunk);
   }

   if (!delayIdata.empty()) {
diff --git a/lld/test/COFF/Inputs/loadconfig-arm64ec.s b/lld/test/COFF/Inputs/loadconfig-arm64ec.s
index 78e7fba43a0a4..75dc6105301d0 100644
--- a/lld/test/COFF/Inputs/loadconfig-arm64ec.s
+++ b/lld/test/COFF/Inputs/loadconfig-arm64ec.s
@@ -30,6 +30,8 @@ __os_arm64x_dispatch_ret:
         .xword 0
 __os_arm64x_check_call:
         .xword 0
+        .globl __os_arm64x_dispatch_icall
+__os_arm64x_dispatch_icall:
 __os_arm64x_check_icall:
         .xword 0
 __os_arm64x_get_x64_information:
diff --git a/lld/test/COFF/arm64ec-import.test b/lld/test/COFF/arm64ec-import.test
index b1c47d785e445..44a84c09e11a3 100644
--- a/lld/test/COFF/arm64ec-import.test
+++ b/lld/test/COFF/arm64ec-import.test
@@ -2,17 +2,19 @@ REQUIRES: aarch64, x86
 RUN: split-file %s %t.dir && cd %t.dir

 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test.s -o test.obj
+RUN: llvm-mc -filetype=obj -triple=arm64ec-windows icall.s -o icall.obj
+RUN: llvm-mc -filetype=obj -triple=arm64ec-windows hybmp.s -o hybmp.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj
 RUN: llvm-lib -machine:arm64ec -def:test.def -out:test-arm64ec.lib
 RUN: llvm-lib -machine:arm64ec -def:test2.def -out:test2-arm64ec.lib
 RUN: llvm-lib -machine:x64 -def:test.def -out:test-x86_64.lib

 Link using ARM64EC import library:
-RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj \
+RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj icall.obj hybmp.obj \
 RUN:          test.obj test-arm64ec.lib test2-arm64ec.lib

 Link using x86_64 import library:
-RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll loadconfig-arm64ec.obj \
+RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll loadconfig-arm64ec.obj icall.obj hybmp.obj \
 RUN:          test.obj test-x86_64.lib test2-arm64ec.lib

 RUN: llvm-readobj --coff-imports out.dll | FileCheck --check-prefix=IMPORTS %s
@@ -20,7 +22,7 @@ RUN: llvm-readobj --coff-imports out2.dll | FileCheck --check-prefix=IMPORTS %s
 IMPORTS:      Import {
 IMPORTS-NEXT:   Name: test.dll
 IMPORTS-NEXT:   ImportLookupTableRVA:
-IMPORTS-NEXT:   ImportAddressTableRVA: 0x2000
+IMPORTS-NEXT:   ImportAddressTableRVA: 0x3000
 IMPORTS-NEXT:   Symbol: data (0)
 IMPORTS-NEXT:   Symbol: func (0)
 IMPORTS-NEXT:   Symbol: func2 (0)
@@ -28,24 +30,45 @@ IMPORTS-NEXT: }
 IMPORTS-NEXT: Import {
 IMPORTS-NEXT:   Name: test2.dll
 IMPORTS-NEXT:   ImportLookupTableRVA:
-IMPORTS-NEXT:   ImportAddressTableRVA: 0x2020
+IMPORTS-NEXT:   ImportAddressTableRVA: 0x3020
 IMPORTS-NEXT:   Symbol: t2func (0)
 IMPORTS-NEXT: }

 RUN: llvm-objdump -d out.dll | FileCheck --check-prefix=DISASM %s
 RUN: llvm-objdump -d out2.dll | FileCheck --check-prefix=DISASM %s
-DISASM: 0000000180001000 <.text>: -DISASM-NEXT: 180001000: ff 25 02 10 00 00 jmpq *0x1002(%rip) # 0x180002008 +DISASM: 180001000: 52800000 mov w0, #0x0 // =0 +DISASM-NEXT: 180001004: d65f03c0 ret +DISASM-NEXT: 180001008: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 18000100c: f940056b ldr x11, [x11, #0x8] +DISASM-NEXT: 180001010: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 180001014: 9101114a add x10, x10, #0x44 +DISASM-NEXT: 180001018: 17fffffa b 0x180001000 <.text> +DISASM-NEXT: 18000101c: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 180001020: f940096b ldr x11, [x11, #0x10] +DISASM-NEXT: 180001024: f0ffffea adrp x10, 0x180000000 +DISASM-NEXT: 180001028: 9100014a add x10, x10, #0x0 +DISASM-NEXT: 18000102c: 17fffff5 b 0x180001000 <.text> +DISASM-NEXT: 180001030: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 180001034: f940116b ldr x11, [x11, #0x20] +DISASM-NEXT: 180001038: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 18000103c: 9101314a add x10, x10, #0x4c +DISASM-NEXT: 180001040: 17fffff0 b 0x180001000 <.text> +DISASM-NEXT: 180001044: 52800020 mov w0, #0x1 // =1 +DISASM-NEXT: 180001048: d65f03c0 ret +DISASM-NEXT: 18000104c: 52800040 mov w0, #0x2 // =2 +DISASM-NEXT: 180001050: d65f03c0 ret +DISASM-NEXT: ... +DISASM-NEXT: 180002000: ff 25 02 10 00 00 jmpq *0x1002(%rip) # 0x180003008 RUN: llvm-readobj --hex-dump=.test out.dll | FileCheck --check-prefix=TESTSEC %s RUN: llvm-readobj --hex-dump=.test out2.dll | FileCheck --check-prefix=TESTSEC %s -TESTSEC: 0x180005000 08200000 00200000 10200000 20200000 -TESTSEC-NEXT: 0x180005010 00100000 +TESTSEC: 0x180006000 08300000 00300000 10300000 20300000 +TESTSEC-NEXT: 0x180006010 08100000 1c100000 00200000 RUN: llvm-readobj --headers out.dll | FileCheck -check-prefix=HEADERS %s -HEADERS: LoadConfigTableRVA: 0x3008 -HEADERS: IATRVA: 0x2000 +HEADERS: LoadConfigTableRVA: 0x4010 +HEADERS: IATRVA: 0x3000 HEADERS: IATSize: 0x1000 #--- test.s @@ -57,8 +80,49 @@ arm64ec_data_sym: .rva __imp_data .rva __imp_func2 .rva __imp_t2func + .rva __impchk_func + .rva __impchk_func2 .rva func +#--- icall.s + .text + .globl __icall_helper_arm64ec + .p2align 2, 0x0 +__icall_helper_arm64ec: + mov w0, #0 + ret + +#--- hybmp.s + .section .hybmp$x, "yi" + // __imp_func exit thunk is ignored when func is defined as well + .symidx __imp_func + .symidx dead_exit_thunk + .word 4 + .symidx func + .symidx func_exit_thunk + .word 4 + .symidx __imp_t2func + .symidx t2func_exit_thunk + .word 4 + + .section .wowthk$aa,"xr",discard,func_exit_thunk + .globl func_exit_thunk +func_exit_thunk: + mov w0, #1 + ret + + .section .wowthk$aa,"xr",discard,t2func_exit_thunk + .globl t2func_exit_thunk +t2func_exit_thunk: + mov w0, #2 + ret + + .section .wowthk$aa,"xr",discard,dead_exit_thunk + .globl dead_exit_thunk +dead_exit_thunk: + mov w0, #0xdead + ret + #--- test.def NAME test.dll EXPORTS diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index 4ea9a86c1d5fc..564918c58b6dd 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -1,10 +1,12 @@ import os +import pathlib import platform import subprocess import sys import itertools import lldbsuite.test.lldbtest as lldbtest +import lldbsuite.test.lldbplatformutil as lldbplatformutil import lldbsuite.test.lldbutil as lldbutil from lldbsuite.test import configuration from lldbsuite.test_event import build_exception @@ -96,17 +98,101 @@ def getArchSpec(self, architecture): """ 
        return ["ARCH=" + architecture] if architecture else []

-    def getCCSpec(self, compiler):
+    def getToolchainSpec(self, compiler):
         """
-        Helper function to return the key-value string to specify the compiler
+        Helper function to return the key-value strings to specify the toolchain
         used for the make system.
         """
         cc = compiler if compiler else None
         if not cc and configuration.compiler:
             cc = configuration.compiler
-        if cc:
-            return ['CC="%s"' % cc]
-        return []
+
+        if not cc:
+            return []
+
+        cc = cc.strip()
+        cc_path = pathlib.Path(cc)
+
+        # We can get the CC compiler string in the following formats:
+        #  [<tool>] <compiler> - such as 'xcrun clang', 'xcrun /usr/bin/clang', etc.
+        #
+        # Where <compiler> could contain the following parts:
+        #   <simple-name>[.<exe-ext>]                        - such as 'clang', 'clang.exe' ('clang-cl.exe'?)
+        #   <triple-prefix>-<simple-name>[.<exe-ext>]        - such as 'armv7-linux-gnueabi-gcc'
+        #   <path>/<simple-name>[.<exe-ext>]                 - such as '/usr/bin/clang', 'c:\path\to\compiler\clang.exe'
+        #   <path>/<triple-prefix>-<simple-name>[.<exe-ext>] - such as '/usr/bin/clang', 'c:\path\to\compiler\clang.exe'
+
+        cc_ext = cc_path.suffix
+        # Compiler name without extension.
+        cc_name = cc_path.stem.split(" ")[-1]
+
+        # The kind of compiler (canonical name): clang, gcc, cc, etc.
+        cc_type = cc_name
+        # The triple prefix of the compiler name: <triple-prefix>-gcc.
+        cc_prefix = ""
+        if not "clang-cl" in cc_name and not "llvm-gcc" in cc_name:
+            cc_name_parts = cc_name.split("-")
+            cc_type = cc_name_parts[-1]
+            if len(cc_name_parts) > 1:
+                cc_prefix = "-".join(cc_name_parts[:-1]) + "-"
+
+        # The kind of C++ compiler corresponding to cc_type.
+        cxx_types = {
+            "icc": "icpc",
+            "llvm-gcc": "llvm-g++",
+            "gcc": "g++",
+            "cc": "c++",
+            "clang": "clang++",
+        }
+        cxx_type = cxx_types.get(cc_type, cc_type)
+
+        cc_dir = cc_path.parent
+
+        def getToolchainUtil(util_name):
+            return cc_dir / (cc_prefix + util_name + cc_ext)
+
+        cxx = getToolchainUtil(cxx_type)
+
+        util_names = {
+            "OBJCOPY": "objcopy",
+            "STRIP": "strip",
+            "ARCHIVER": "ar",
+            "DWP": "dwp",
+        }
+        utils = []
+
+        if not lldbplatformutil.platformIsDarwin():
+            if cc_type in ["clang", "cc", "gcc"]:
+                util_paths = {}
+                # Assemble a toolchain side-tool command based on the passed CC.
+                for var, name in util_names.items():
+                    # Do not override an explicitly specified tool from the command line.
+                    if not os.getenv(var):
+                        util_paths[var] = getToolchainUtil(name)
+                    else:
+                        util_paths[var] = os.getenv(var)
+                utils.extend(["AR=%s" % util_paths["ARCHIVER"]])
+
+                # Look for llvm-dwp or GNU dwp.
+                if not lldbutil.which(util_paths["DWP"]):
+                    util_paths["DWP"] = getToolchainUtil("llvm-dwp")
+                if not lldbutil.which(util_paths["DWP"]):
+                    util_paths["DWP"] = lldbutil.which("llvm-dwp")
+                if not util_paths["DWP"]:
+                    util_paths["DWP"] = lldbutil.which("dwp")
+                if not util_paths["DWP"]:
+                    del util_paths["DWP"]
+
+                for var, path in util_paths.items():
+                    utils.append("%s=%s" % (var, path))
+        else:
+            utils.extend(["AR=%slibtool" % os.getenv("CROSS_COMPILE", "")])
+
+        return [
+            "CC=%s" % cc,
+            "CC_TYPE=%s" % cc_type,
+            "CXX=%s" % cxx,
+        ] + utils

     def getSDKRootSpec(self):
         """
@@ -178,7 +264,7 @@ def getBuildCommand(
             make_targets,
             self.getArchCFlags(architecture),
             self.getArchSpec(architecture),
-            self.getCCSpec(compiler),
+            self.getToolchainSpec(compiler),
             self.getExtraMakeArgs(),
             self.getSDKRootSpec(),
             self.getModuleCacheSpec(),
diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
index 1ba3f843e87cf..f81db9bc06d8a 100644
--- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
+++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
@@ -102,15 +102,22 @@ endif
 # If you change the defaults of CC, be sure to also change it in the file
 # test/builders/builder_base.py, which provides a Python way to return the
 # value of the make variable CC -- getCompiler().
-#
-# See also these functions:
-#   o cxx_compiler
-#   o cxx_linker
 #----------------------------------------------------------------------
 ifeq "$(CC)" ""
 $(error "C compiler is not specified. Please run tests through lldb-dotest or lit")
 endif

+# Always override the linker. Assign the already-normalized CC.
+override LD := $(CC)
+# The kind of linker; it is always derived from CC.
+override LDC := $(CC_TYPE)
+
+ifeq "$(HOST_OS)" "Windows_NT"
+  # Wrap the full path in platform-specific quotes. This is necessary to run
+  # the c++ executable properly under 'sh' on a Windows host (it prevents path
+  # breakage caused by Windows-style path separators).
+  override CXX := $(QUOTE)$(CXX)$(QUOTE)
+endif
+
 #----------------------------------------------------------------------
 # Handle SDKROOT for the cross platform builds.
 #----------------------------------------------------------------------
@@ -147,10 +154,8 @@ ifeq "$(OS)" "Darwin"
 	DS := $(DSYMUTIL)
 	DSFLAGS := $(DSFLAGS_EXTRAS)
 	DSYM = $(EXE).dSYM
-	AR := $(CROSS_COMPILE)libtool
 	ARFLAGS := -static -o
 else
-	AR := $(CROSS_COMPILE)ar
 	# On non-Apple platforms, -arch becomes -m
 	ARCHFLAG := -m

@@ -213,7 +218,7 @@ endif
 LIMIT_DEBUG_INFO_FLAGS =
 NO_LIMIT_DEBUG_INFO_FLAGS =
 MODULE_DEBUG_INFO_FLAGS =
-ifneq (,$(findstring clang,$(CC)))
+ifeq ($(CC_TYPE), clang)
   LIMIT_DEBUG_INFO_FLAGS += -flimit-debug-info
   NO_LIMIT_DEBUG_INFO_FLAGS += -fno-limit-debug-info
   MODULE_DEBUG_INFO_FLAGS += -gmodules
@@ -279,7 +284,6 @@ endif
 CFLAGS += $(CFLAGS_EXTRAS)
 CXXFLAGS += -std=c++11 $(CFLAGS) $(ARCH_CXXFLAGS)
-LD = $(CC)
 # Copy common options to the linker flags (dwarf, arch. & etc).
 #  Note: we get some 'garbage' options for linker here (such as -I, --isystem & etc).
 LDFLAGS += $(CFLAGS)
@@ -312,61 +316,6 @@ ifneq "$(DYLIB_NAME)" ""
   endif
 endif

-# Function that returns the counterpart C++ compiler, given $(CC) as arg.
-cxx_compiler_notdir = $(if $(findstring icc,$(1)), \ - $(subst icc,icpc,$(1)), \ - $(if $(findstring llvm-gcc,$(1)), \ - $(subst llvm-gcc,llvm-g++,$(1)), \ - $(if $(findstring gcc,$(1)), \ - $(subst gcc,g++,$(1)), \ - $(subst cc,c++,$(1))))) -cxx_compiler = $(if $(findstring /,$(1)),$(join $(dir $(1)), $(call cxx_compiler_notdir,$(notdir $(1)))),$(call cxx_compiler_notdir,$(1))) - -# Function that returns the C++ linker, given $(CC) as arg. -cxx_linker_notdir = $(if $(findstring icc,$(1)), \ - $(subst icc,icpc,$(1)), \ - $(if $(findstring llvm-gcc,$(1)), \ - $(subst llvm-gcc,llvm-g++,$(1)), \ - $(if $(findstring gcc,$(1)), \ - $(subst gcc,g++,$(1)), \ - $(subst cc,c++,$(1))))) -cxx_linker = $(if $(findstring /,$(1)),$(join $(dir $(1)), $(call cxx_linker_notdir,$(notdir $(1)))),$(call cxx_linker_notdir,$(1))) - -ifneq "$(OS)" "Darwin" - CLANG_OR_GCC := $(strip $(if $(findstring clang,$(CC)), \ - $(findstring clang,$(CC)), \ - $(if $(findstring gcc,$(CC)), \ - $(findstring gcc,$(CC)), \ - cc))) - - CC_LASTWORD := $(strip $(lastword $(subst -, ,$(CC)))) - - replace_with = $(strip $(if $(findstring $(3),$(CC_LASTWORD)), \ - $(subst $(3),$(1),$(2)), \ - $(subst $(3),$(1),$(subst -$(CC_LASTWORD),,$(2))))) - - ifeq "$(notdir $(CC))" "$(CC)" - replace_cc_with = $(call replace_with,$(1),$(CC),$(CLANG_OR_GCC)) - else - replace_cc_with = $(join $(dir $(CC)),$(call replace_with,$(1),$(notdir $(CC)),$(CLANG_OR_GCC))) - endif - - OBJCOPY ?= $(call replace_cc_with,objcopy) - ARCHIVER ?= $(call replace_cc_with,ar) - # Look for llvm-dwp or gnu dwp - DWP ?= $(call replace_cc_with,llvm-dwp) - ifeq ($(wildcard $(DWP)),) - DWP = $(call replace_cc_with,dwp) - ifeq ($(wildcard $(DWP)),) - DWP = $(shell command -v llvm-dwp 2> /dev/null) - ifeq ($(wildcard $(DWP)),) - DWP = $(shell command -v dwp 2> /dev/null) - endif - endif - endif - override AR = $(ARCHIVER) -endif - ifdef PIE LDFLAGS += -pie endif @@ -375,7 +324,7 @@ endif # Windows specific options #---------------------------------------------------------------------- ifeq "$(OS)" "Windows_NT" - ifneq (,$(findstring clang,$(CC))) + ifeq ($(CC_TYPE), clang) # Clang for Windows doesn't support C++ Exceptions CXXFLAGS += -fno-exceptions CXXFLAGS += -D_HAS_EXCEPTIONS=0 @@ -420,7 +369,7 @@ endif ifeq (1,$(USE_LIBSTDCPP)) # Clang requires an extra flag: -stdlib=libstdc++ - ifneq (,$(findstring clang,$(CC))) + ifeq ($(CC_TYPE), clang) # Force clang looking for the gcc's headers at specific rootfs folder. CXXFLAGS += -stdlib=libstdc++ $(GCC_TOOLCHAIN_FLAGS) LDFLAGS += -stdlib=libstdc++ $(GCC_TOOLCHAIN_FLAGS) @@ -458,7 +407,7 @@ ifeq (1, $(USE_SYSTEM_STDLIB)) CXXFLAGS += -nostdlib++ -nostdinc++ -cxx-isystem $(SDKROOT)/usr/include/c++/v1 LDFLAGS += -L$(SDKROOT)/usr/lib -Wl,-rpath,$(SDKROOT)/usr/lib -lc++ else - ifneq (,$(findstring clang,$(CC))) + ifeq ($(CC_TYPE),clang) # Force clang looking for the gcc's headers at specific rootfs folder. 
CXXFLAGS += $(GCC_TOOLCHAIN_FLAGS) LDFLAGS += $(GCC_TOOLCHAIN_FLAGS) @@ -485,8 +434,6 @@ DYLIB_OBJECTS +=$(strip $(DYLIB_C_SOURCES:.c=.o)) DYLIB_OBJECTS +=$(strip $(DYLIB_OBJC_SOURCES:.m=.o)) ifneq "$(strip $(DYLIB_CXX_SOURCES))" "" DYLIB_OBJECTS +=$(strip $(patsubst %.mm, %.o, $(DYLIB_CXX_SOURCES:.cpp=.o))) - CXX = $(call cxx_compiler,$(CC)) - LD = $(call cxx_linker,$(CC)) endif #---------------------------------------------------------------------- @@ -509,8 +456,6 @@ endif #---------------------------------------------------------------------- ifneq "$(strip $(CXX_SOURCES))" "" OBJECTS +=$(strip $(CXX_SOURCES:.cpp=.o)) - CXX = $(call cxx_compiler,$(CC)) - LD = $(call cxx_linker,$(CC)) endif #---------------------------------------------------------------------- @@ -526,19 +471,18 @@ endif #---------------------------------------------------------------------- ifneq "$(strip $(OBJCXX_SOURCES))" "" OBJECTS +=$(strip $(OBJCXX_SOURCES:.mm=.o)) - CXX = $(call cxx_compiler,$(CC)) - LD = $(call cxx_linker,$(CC)) ifeq "$(findstring lobjc,$(LDFLAGS))" "" LDFLAGS +=-lobjc endif endif -ifeq ($(findstring clang, $(CXX)), clang) +ifeq ($(CC_TYPE), clang) CXXFLAGS += --driver-mode=g++ endif ifneq "$(CXX)" "" - ifeq ($(findstring clang, $(LD)), clang) + # Specify the driver mode parameter if we use clang as the linker. + ifeq ($(LDC), clang) LDFLAGS += --driver-mode=g++ endif endif diff --git a/lldb/test/API/commands/statistics/basic/TestStats.py b/lldb/test/API/commands/statistics/basic/TestStats.py index 03ec169344705..a0a9eeb649320 100644 --- a/lldb/test/API/commands/statistics/basic/TestStats.py +++ b/lldb/test/API/commands/statistics/basic/TestStats.py @@ -921,6 +921,7 @@ def test_order_of_options_do_not_matter(self): f"The order of options '{options[0]}' and '{options[1]}' should not matter", ) + @skipIfWindows def test_summary_statistics_providers(self): """ Test summary timing statistics is included in statistics dump when @@ -960,6 +961,7 @@ def test_summary_statistics_providers(self): self.assertIn("'totalTime':", summary_provider_str) self.assertIn("'type': 'python'", summary_provider_str) + @skipIfWindows def test_summary_statistics_providers_vec(self): """ Test summary timing statistics is included in statistics dump when diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile b/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile index 9645fd9cc8dfb..304633c2dca1f 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile @@ -1,6 +1,6 @@ C_SOURCES := main.c -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile b/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile +++ 
b/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/cpp/Makefile b/lldb/test/API/functionalities/breakpoint/cpp/Makefile index 66108b79e7fe0..3b4be01d551f4 100644 --- a/lldb/test/API/functionalities/breakpoint/cpp/Makefile +++ b/lldb/test/API/functionalities/breakpoint/cpp/Makefile @@ -1,7 +1,7 @@ CXX_SOURCES := main.cpp CXXFLAGS_EXTRAS := -std=c++14 -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile b/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile index 9645fd9cc8dfb..304633c2dca1f 100644 --- a/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile +++ b/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile @@ -1,6 +1,6 @@ C_SOURCES := main.c -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile index 9645fd9cc8dfb..304633c2dca1f 100644 --- a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile @@ -1,6 +1,6 @@ C_SOURCES := main.c -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile +++ b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile b/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile +++ b/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py index a0d6802b3a506..c1cd9556c5ef3 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py @@ -16,12 +16,12 @@ def appkit_tester_impl(self, commands, use_constant_classes): self.build() else: disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. 
"CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.appkit_common_data_formatters_command() commands() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py index 98438742a11ca..6b5bcf8a7df2f 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py @@ -100,6 +100,16 @@ def cleanup(): "s", result_type=ns + "::wstring", result_summary='L"hello world! מזל טוב!"' ) + self.expect_expr( + "q", result_type=ns + "::string", result_summary='"hello world"' + ) + + self.expect_expr( + "Q", + result_type=ns + "::string", + result_summary='"quite a long std::strin with lots of info inside it"', + ) + self.expect( "frame variable", substrs=[ diff --git a/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py b/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py index 9ac41d67eb9ab..e1d7e42bdd1a9 100644 --- a/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py +++ b/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py @@ -26,12 +26,12 @@ def test_rdar11988289_with_run_command(self): def test_rdar11988289_with_run_command_no_const(self): """Test that NSDictionary reports its synthetic children properly.""" disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py b/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py index 053ec0ee9757e..1037e75c17eb3 100644 --- a/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py +++ b/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py @@ -26,12 +26,12 @@ def test_rdar12529957_with_run_command(self): def test_rdar12529957_with_run_command_no_const(self): """Test that NSSet reports its synthetic children properly.""" disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. 
+ self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py b/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py index fff37829cd20d..db86f48f8ec1f 100644 --- a/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py +++ b/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py @@ -20,13 +20,13 @@ def test_print_array(self): def test_print_array_no_const(self): """Test that expr -O -Z works""" disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "USE_SYSTEM_STDLIB": "1", # See above. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.printarray_data_formatter_commands() def setUp(self): diff --git a/lldb/test/API/functionalities/inline-stepping/Makefile b/lldb/test/API/functionalities/inline-stepping/Makefile index 362b89d7f995b..bf646c7b7db33 100644 --- a/lldb/test/API/functionalities/inline-stepping/Makefile +++ b/lldb/test/API/functionalities/inline-stepping/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := calling.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt b/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt index 7096efabdcfe1..d594b585b2d5f 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt +++ b/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt @@ -19,6 +19,7 @@ # to generate a Minidump when the binary crashes/requests such. # CC=g++ +CC_TYPE=gcc FLAGS=-g --std=c++11 INCLUDE=-I$HOME/breakpad/src/src/ LINK=-L. -lbreakpad -lpthread -nostdlib -lc -lstdc++ -lgcc_s -fno-exceptions diff --git a/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py b/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py index 14bfc322979b3..a7d6d9d155efc 100644 --- a/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py +++ b/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py @@ -12,12 +12,12 @@ def test_ordered_set(self): @skipUnlessDarwin def test_ordered_set_no_const(self): disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove when flags are available upstream. 
+ self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py b/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py index 68c0af76b8e3b..8debe731dfe1a 100644 --- a/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py +++ b/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py @@ -28,12 +28,12 @@ def test_single_entry_dict(self): ) # bug in NSDictionary formatting on watchos def test_single_entry_dict_no_const(self): disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/macosx/macCatalyst/Makefile b/lldb/test/API/macosx/macCatalyst/Makefile index 3f084968a2d57..ef17d89d2372c 100644 --- a/lldb/test/API/macosx/macCatalyst/Makefile +++ b/lldb/test/API/macosx/macCatalyst/Makefile @@ -7,6 +7,7 @@ USE_SYSTEM_STDLIB := 1 # FIXME: rdar://problem/54986190 # There is a Clang driver change missing on llvm.org. +override CC_TYPE=clang override CC=xcrun clang include Makefile.rules diff --git a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile index b24fe3f574ccf..c77a186724fda 100644 --- a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile +++ b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile @@ -5,6 +5,7 @@ override TRIPLE := $(ARCH)-apple-ios13.0-macabi CFLAGS_EXTRAS := -target $(TRIPLE) # FIXME: rdar://problem/54986190 +override CC_TYPE=clang override CC=xcrun clang all: libfoo.dylib a.out diff --git a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py index b712afdd7560a..3f5645a486bcb 100644 --- a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py +++ b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py @@ -59,10 +59,10 @@ def run_with(self, arch, os, vers, env, expected_load_command): self.build( dictionary={ "ARCH": arch, - "CC": clang, "ARCH_CFLAGS": "-target {} {}".format(triple, version_min), "SDKROOT": sdk_root, - } + }, + compiler=clang, ) self.check_load_commands(expected_load_command) diff --git a/lldb/test/API/python_api/frame/inlines/Makefile b/lldb/test/API/python_api/frame/inlines/Makefile index e6d9d8310a0fa..cf17569a5e351 100644 --- a/lldb/test/API/python_api/frame/inlines/Makefile +++ b/lldb/test/API/python_api/frame/inlines/Makefile @@ -1,6 +1,6 @@ C_SOURCES := inlines.c -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py b/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py index 16297efe14372..ed47f94e9492b 100644 --- a/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py +++ b/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py @@ -71,12 +71,12 @@ def check_simulator_ostype(self, sdk, platform_name, arch=platform.machine()): self.build( dictionary={ "EXE": exe_name, - "CC": clang, "SDKROOT": sdkroot.strip(), "ARCH": arch, 
"ARCH_CFLAGS": "-target {} {}".format(triple, version_min), "USE_SYSTEM_STDLIB": 1, - } + }, + compiler=clang, ) exe_path = os.path.realpath(self.getBuildArtifact(exe_name)) cmd = [ diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp new file mode 100644 index 0000000000000..1c9cc36a711b4 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp @@ -0,0 +1,28 @@ +// LLDB currently erroneously adds an unnamed bitfield +// into the AST when an overlapping no_unique_address +// field precedes a bitfield. + +// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s +// RUN: %lldb %t \ +// RUN: -o "target var global" \ +// RUN: -o "image dump ast" \ +// RUN: -o exit | FileCheck %s + +// CHECK: (lldb) image dump ast +// CHECK: CXXRecordDecl {{.*}} struct Foo definition +// CHECK: |-FieldDecl {{.*}} data 'char[5]' +// CHECK-NEXT: |-FieldDecl {{.*}} padding 'Empty' +// CHECK-NEXT: |-FieldDecl {{.*}} 'int' +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: `-FieldDecl {{.*}} sloc> flag 'unsigned long' +// CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1 + +struct Empty {}; + +struct Foo { + char data[5]; + [[no_unique_address]] Empty padding; + unsigned long flag : 1; +}; + +Foo global; diff --git a/llvm/cmake/modules/VersionFromVCS.cmake b/llvm/cmake/modules/VersionFromVCS.cmake index 18edbeabe3e4b..da42781d2ae39 100644 --- a/llvm/cmake/modules/VersionFromVCS.cmake +++ b/llvm/cmake/modules/VersionFromVCS.cmake @@ -39,6 +39,30 @@ function(get_source_info path revision repository) OUTPUT_VARIABLE git_output ERROR_QUIET) if(git_result EQUAL 0) + # Passwords or tokens should not be stored in the remote URL at the + # risk of being leaked. In case we find one, error out and teach the + # user the best practices. + string(REGEX MATCH "https?://[^/]*:[^/]*@.*" + http_password "${git_output}") + if(http_password) + message(SEND_ERROR "The git remote repository URL has an embedded \ +password. Remove the password from the URL or use \ +`-DLLVM_FORCE_VC_REPOSITORY=` in order to avoid \ +leaking your password (see https://git-scm.com/docs/gitcredentials for \ +alternatives).") + endif() + # GitHub token formats are described at: + # https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/about-authentication-to-github#githubs-token-formats + string(REGEX MATCH + "https?://(gh[pousr]|github_pat)_[^/]+@github.com.*" + github_token "${git_output}") + if(github_token) + message(SEND_ERROR "The git remote repository URL has an embedded \ +GitHub Token. Remove the token from the URL or use \ +`-DLLVM_FORCE_VC_REPOSITORY=` in order to avoid leaking \ +your token (see https://git-scm.com/docs/gitcredentials for alternatives).") + endif() + string(STRIP "${git_output}" git_output) set(${repository} ${git_output} PARENT_SCOPE) else() diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst index 49ec310b382f9..f74adc4702d38 100644 --- a/llvm/docs/DeveloperPolicy.rst +++ b/llvm/docs/DeveloperPolicy.rst @@ -1077,6 +1077,8 @@ If you have questions or comments about these topics, please ask on the please realize that most compiler developers are not lawyers, and therefore you will not be getting official legal advice. +.. _LLVM Discourse forums: https://discourse.llvm.org + Copyright --------- @@ -1301,4 +1303,28 @@ to move code from (e.g.) 
libc++ to the LLVM core without concern, but that code cannot be moved from the LLVM core to libc++ without the copyright owner's permission. -.. _LLVM Discourse forums: https://discourse.llvm.org +.. _ai contributions: + +AI generated contributions +-------------------------- + +Artificial intelligence systems raise many questions around copyright that have +yet to be answered. Our policy on AI tools is guided by our copyright policy: +Contributors are responsible for ensuring that they have the right to contribute +code under the terms of our license, typically meaning that either they, their +employer, or their collaborators hold the copyright. Using AI tools to +regenerate copyrighted material does not remove the copyright, and contributors +are responsible for ensuring that such material does not appear in their +contributions. + +As such, the LLVM policy is that contributors are permitted to use artificial +intelligence tools to produce contributions, provided that they have the right +to license that code under the project license. Contributions found to violate +this policy will be removed just like any other offending contribution. + +While the LLVM project has a liberal policy on AI tool use, contributors are +considered responsible for their contributions. We encourage contributors to +review all generated code before sending it for review to verify its +correctness and to understand it so that they can answer questions during code +review. Reviewing and maintaining generated code that the original contributor +does not understand is not a good use of limited project resources. diff --git a/llvm/docs/FAQ.rst b/llvm/docs/FAQ.rst index 229ac99f703c1..aa20de47a6998 100644 --- a/llvm/docs/FAQ.rst +++ b/llvm/docs/FAQ.rst @@ -22,6 +22,13 @@ Yes. This is why we distribute LLVM under a less restrictive license than GPL, as explained in the first question above. +Can I use AI coding tools, such as GitHub co-pilot, to write LLVM patches? +-------------------------------------------------------------------------- +Yes, as long as the resulting work can be licensed under the project license, as +covered in the :doc:`DeveloperPolicy`. Using an AI tool to reproduce copyrighted +work does not rinse it of copyright and grant you the right to relicense it. + + Source Code =========== diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 8846b82fcaea5..a15af9adfa945 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -281,6 +281,13 @@ Supported ``Za128rs``, ``Za64rs``, ``Zama16b``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare`` These extensions are defined as part of the `RISC-V Profiles specification `__. They do not introduce any new features themselves, but instead describe existing hardware features. +Atomics ABIs +============ + +At the time of writing there are three atomics mappings (ABIs) `defined for RISC-V `__. As of LLVM 19, LLVM defaults to "A6S", which is compatible with both the original "A6" and the future "A7" ABI. See `the psABI atomics document `__ for more information on these mappings. + +Note that although the "A6S" mapping is used, the ELF attribute recording the mapping isn't currently emitted by default due to a bug causing a crash in older versions of binutils when processing files containing this attribute. 
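As a concrete illustration of what an atomics mapping pins down (a minimal sketch, not part of this change; the names are invented), the mapping fixes which instruction sequences and ordering annotations the compiler may emit for ordinary atomic code such as the following, which is what makes "A6S" objects link-compatible with both "A6" and "A7" objects::

    #include <atomic>

    std::atomic<int> Guard{0};

    bool tryLock() {
      int Expected = 0;
      // A sequentially consistent read-modify-write: the psABI atomics
      // mapping constrains the AMO or LR/SC sequence this may lower to
      // on RISC-V.
      return Guard.compare_exchange_strong(Expected, 1,
                                           std::memory_order_seq_cst);
    }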
+ Experimental Extensions ======================= diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h index acf89885af6fd..9b21d6d65c2a8 100644 --- a/llvm/include/llvm/BinaryFormat/Wasm.h +++ b/llvm/include/llvm/BinaryFormat/Wasm.h @@ -144,6 +144,14 @@ enum : unsigned { WASM_OPCODE_I32_RMW_CMPXCHG = 0x48, }; +// Sub-opcodes for catch clauses in a try_table instruction +enum : unsigned { + WASM_OPCODE_CATCH = 0x00, + WASM_OPCODE_CATCH_REF = 0x01, + WASM_OPCODE_CATCH_ALL = 0x02, + WASM_OPCODE_CATCH_ALL_REF = 0x03, +}; + enum : unsigned { WASM_LIMITS_FLAG_NONE = 0x0, WASM_LIMITS_FLAG_HAS_MAX = 0x1, diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 9b62d6067be39..828532dcffb7d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -831,6 +831,12 @@ class CombinerHelper { /// Combine ors. bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo); + /// trunc (binop X, C) --> binop (trunc X, trunc C). + bool matchNarrowBinop(const MachineInstr &TruncMI, + const MachineInstr &BinopMI, BuildFnTy &MatchInfo); + + bool matchCastOfInteger(const MachineInstr &CastMI, APInt &MatchInfo); + /// Combine addos. bool matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo); diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 197f66e8659d5..ebf06bc57948f 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -1204,15 +1204,6 @@ class TargetRegisterInfo : public MCRegisterInfo { return false; } - /// Returns the Largest Super Class that is being initialized. There - /// should be a Pseudo Instruction implemented for the super class - /// that is being returned to ensure that Init Undef can apply the - /// initialization correctly. - virtual const TargetRegisterClass * - getLargestSuperClass(const TargetRegisterClass *RC) const { - llvm_unreachable("Unexpected target register class."); - } - /// Returns if the architecture being targeted has the required Pseudo /// Instructions for initializing the register. 
By default this returns false, /// but where it is overriden for an architecture, the behaviour will be diff --git a/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h b/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h index 3a71ddc88ce95..2660b9f74f405 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h @@ -13,6 +13,7 @@ #ifndef LLVM_EXECUTIONENGINE_ORC_LLJIT_H #define LLVM_EXECUTIONENGINE_ORC_LLJIT_H +#include "llvm/ADT/SmallSet.h" #include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" @@ -620,6 +621,7 @@ class ORCPlatformSupport : public LLJIT::PlatformSupport { private: orc::LLJIT &J; DenseMap DSOHandles; + SmallPtrSet InitializedDylib; }; } // End namespace orc diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 2085113992ad1..e20c26eb83787 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1026,18 +1026,14 @@ defset list AMDGPUImageDimAtomicIntrinsics = { } } - multiclass AMDGPUImageDimAtomic { - defm "" - : AMDGPUImageDimAtomicX, "vdata">], rettype>; - } + multiclass AMDGPUImageDimAtomic : + AMDGPUImageDimAtomicX, "vdata">], rettype>; - multiclass AMDGPUImageDimFloatAtomic { - defm "" : AMDGPUImageDimAtomic; - } + multiclass AMDGPUImageDimFloatAtomic : + AMDGPUImageDimAtomic; - multiclass AMDGPUImageDimAnyAtomic { - defm "" : AMDGPUImageDimAtomic; - } + multiclass AMDGPUImageDimAnyAtomic : + AMDGPUImageDimAtomic; defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAnyAtomic<"ATOMIC_SWAP">; defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">; diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index fc6e93606de12..214aa4e1c562d 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -19,7 +19,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/StringMap.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/LTO/Config.h" diff --git a/llvm/include/llvm/SandboxIR/Pass.h b/llvm/include/llvm/SandboxIR/Pass.h index d659e96839213..caf1c70a84147 100644 --- a/llvm/include/llvm/SandboxIR/Pass.h +++ b/llvm/include/llvm/SandboxIR/Pass.h @@ -37,8 +37,8 @@ class Pass { Pass.print(OS); return OS; } - void print(raw_ostream &OS) const { OS << Name; } - LLVM_DUMP_METHOD void dump() const; + virtual void print(raw_ostream &OS) const { OS << Name; } + LLVM_DUMP_METHOD virtual void dump() const; #endif }; diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h new file mode 100644 index 0000000000000..5e250641f3b3f --- /dev/null +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -0,0 +1,99 @@ +//===- PassManager.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Registers and executes the Sandbox IR passes. +// +// The pass manager contains an ordered sequence of passes that it runs in +// order. The passes are owned by the PassRegistry, not by the PassManager. +// +// Note that in this design a pass manager is also a pass. 
So a pass manager +// runs when its turn comes in its parent pass manager's pipeline. +// + +#ifndef LLVM_SANDBOXIR_PASSMANAGER_H +#define LLVM_SANDBOXIR_PASSMANAGER_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/SandboxIR/Pass.h" +#include "llvm/Support/Debug.h" + +namespace llvm::sandboxir { + +class Value; + +/// Base class. +template <typename ParentPass, typename ContainedPass> +class PassManager : public ParentPass { +protected: + /// The list of passes that this pass manager will run. + SmallVector<ContainedPass *> Passes; + + PassManager(StringRef Name) : ParentPass(Name) {} + PassManager(const PassManager &) = delete; + virtual ~PassManager() = default; + PassManager &operator=(const PassManager &) = delete; + +public: + /// Adds \p Pass to the pass pipeline. + void addPass(ContainedPass *Pass) { + // TODO: Check that Pass's class type works with this PassManager type. + Passes.push_back(Pass); + } +#ifndef NDEBUG + void print(raw_ostream &OS) const override { + OS << this->getName(); + OS << "("; + interleave(Passes, OS, [&OS](auto *Pass) { OS << Pass->getName(); }, ","); + OS << ")"; + } + LLVM_DUMP_METHOD void dump() const override { + print(dbgs()); + dbgs() << "\n"; + } +#endif +}; + +class FunctionPassManager final + : public PassManager<FunctionPass, FunctionPass> { +public: + FunctionPassManager(StringRef Name) : PassManager(Name) {} + bool runOnFunction(Function &F) final; +}; + +/// Owns the passes and provides an API to get a pass by its name. +class PassRegistry { + SmallVector<std::unique_ptr<Pass>, 8> Passes; + DenseMap<StringRef, Pass *> NameToPassMap; + +public: + PassRegistry() = default; + /// Registers \p PassPtr and takes ownership. + Pass &registerPass(std::unique_ptr<Pass> &&PassPtr) { + auto &PassRef = *PassPtr.get(); + NameToPassMap[PassRef.getName()] = &PassRef; + Passes.push_back(std::move(PassPtr)); + return PassRef; + } + /// \Returns the pass with name \p Name, or null if not registered. + Pass *getPassByName(StringRef Name) const { + auto It = NameToPassMap.find(Name); + return It != NameToPassMap.end() ? It->second : nullptr; + } +#ifndef NDEBUG + void print(raw_ostream &OS) const { + for (const auto &PassPtr : Passes) + OS << PassPtr->getName() << "\n"; + } + LLVM_DUMP_METHOD void dump() const; +#endif +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_SANDBOXIR_PASSMANAGER_H diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 8f025f7257b39..2fdbbbd094650 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -123,6 +123,7 @@ class ConstantFP; class ConstantAggregateZero; class ConstantPointerNull; class PoisonValue; +class BlockAddress; class Context; class Function; class Instruction; @@ -323,6 +324,7 @@ class Value { friend class ConstantPointerNull; // For `Val`. friend class UndefValue; // For `Val`. friend class PoisonValue; // For `Val`. + friend class BlockAddress; // For `Val`. /// All values point to the context. Context &Ctx; @@ -1112,6 +1114,33 @@ class PoisonValue final : public UndefValue { #endif }; +class BlockAddress final : public Constant { + BlockAddress(llvm::BlockAddress *C, Context &Ctx) + : Constant(ClassID::BlockAddress, C, Ctx) {} + friend class Context; // For constructor. + +public: + /// Return a BlockAddress for the specified function and basic block. + static BlockAddress *get(Function *F, BasicBlock *BB); + + /// Return a BlockAddress for the specified basic block. The basic + /// block must be embedded into a function.
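The pass-manager design sketched in the comment above supports a short usage example (a minimal sketch, not part of this change: it assumes the FunctionPass base class from Pass.h with a virtual runOnFunction(Function &) hook, as FunctionPassManager's override implies, and the pass names "nop" and "fpm" are invented)::

    #include "llvm/SandboxIR/PassManager.h"
    #include <memory>

    using namespace llvm::sandboxir;

    // A trivial pass that visits a function and reports "no change".
    class NopPass final : public FunctionPass {
    public:
      NopPass() : FunctionPass("nop") {}
      bool runOnFunction(Function &F) final { return false; }
    };

    bool runPipeline(PassRegistry &Registry, Function &F) {
      // The registry owns the passes; managers only hold raw pointers.
      auto NopUP = std::make_unique<NopPass>();
      NopPass *Nop = NopUP.get();
      Registry.registerPass(std::move(NopUP));

      // A pass manager is itself a pass, so it is registered the same way.
      auto FPMUP = std::make_unique<FunctionPassManager>("fpm");
      FunctionPassManager *FPM = FPMUP.get();
      Registry.registerPass(std::move(FPMUP));

      FPM->addPass(Nop);
      return FPM->runOnFunction(F); // Runs "nop" in order; true if IR changed.
    }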
+ static BlockAddress *get(BasicBlock *BB); + + /// Lookup an existing \c BlockAddress constant for the given BasicBlock. + /// + /// \returns 0 if \c !BB->hasAddressTaken(), otherwise the \c BlockAddress. + static BlockAddress *lookup(const BasicBlock *BB); + + Function *getFunction() const; + BasicBlock *getBasicBlock() const; + + /// For isa/dyn_cast. + static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::BlockAddress; + } +}; + /// Iterator for `Instruction`s in a `BasicBlock`. /// \Returns a sandboxir::Instruction & when dereferenced. class BBIterator { @@ -1194,9 +1223,7 @@ class BasicBlock : public Value { Instruction &back() const; #ifndef NDEBUG - void verify() const final { - assert(isa<llvm::BasicBlock>(Val) && "Expected BasicBlock!"); - } + void verify() const final; void dumpOS(raw_ostream &OS) const final; #endif }; @@ -1435,7 +1462,7 @@ template class SingleLLVMInstructionImpl : public Instruction { #endif }; -class FenceInst : public SingleLLVMInstructionImpl { +class FenceInst : public SingleLLVMInstructionImpl { FenceInst(llvm::FenceInst *FI, Context &Ctx) : SingleLLVMInstructionImpl(ClassID::Fence, Opcode::Fence, FI, Ctx) {} friend Context; // For constructor. diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 459226216703d..c29e8be24ea75 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -34,6 +34,7 @@ DEF_CONST(ConstantAggregateZero, ConstantAggregateZero) DEF_CONST(ConstantPointerNull, ConstantPointerNull) DEF_CONST(UndefValue, UndefValue) DEF_CONST(PoisonValue, PoisonValue) +DEF_CONST(BlockAddress, BlockAddress) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 44aee4e4a5b46..a2ac9e014b44a 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -25,6 +25,8 @@ class Context; // Forward declare friend classes for MSVC. class PointerType; class VectorType; +class FixedVectorType; +class ScalableVectorType; class IntegerType; class FunctionType; class ArrayType; @@ -38,20 +40,22 @@ class StructType; class Type { protected: llvm::Type *LLVMTy; - friend class ArrayType; // For LLVMTy. - friend class StructType; // For LLVMTy. - friend class VectorType; // For LLVMTy. - friend class PointerType; // For LLVMTy. - friend class FunctionType; // For LLVMTy. - friend class IntegerType; // For LLVMTy. - friend class Function; // For LLVMTy. - friend class CallBase; // For LLVMTy. - friend class ConstantInt; // For LLVMTy. - friend class ConstantArray; // For LLVMTy. - friend class ConstantStruct; // For LLVMTy. - friend class ConstantVector; // For LLVMTy. - friend class CmpInst; // For LLVMTy. TODO: Cleanup after - // sandboxir::VectorType is more complete. + friend class ArrayType; // For LLVMTy. + friend class StructType; // For LLVMTy. + friend class VectorType; // For LLVMTy. + friend class FixedVectorType; // For LLVMTy. + friend class ScalableVectorType; // For LLVMTy. + friend class PointerType; // For LLVMTy. + friend class FunctionType; // For LLVMTy. + friend class IntegerType; // For LLVMTy. + friend class Function; // For LLVMTy. + friend class CallBase; // For LLVMTy. + friend class ConstantInt; // For LLVMTy. + friend class ConstantArray; // For LLVMTy. + friend class ConstantStruct; // For LLVMTy. + friend class ConstantVector; // For LLVMTy.
+ friend class CmpInst; // For LLVMTy. TODO: Cleanup after + // sandboxir::VectorType is more complete. // Friend all instruction classes because `create()` functions use LLVMTy. #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; @@ -344,6 +348,101 @@ class VectorType : public Type { } }; +class FixedVectorType : public VectorType { +public: + static FixedVectorType *get(Type *ElementType, unsigned NumElts); + + static FixedVectorType *get(Type *ElementType, const FixedVectorType *FVTy) { + return get(ElementType, FVTy->getNumElements()); + } + + static FixedVectorType *getInteger(FixedVectorType *VTy) { + return cast(VectorType::getInteger(VTy)); + } + + static FixedVectorType *getExtendedElementVectorType(FixedVectorType *VTy) { + return cast(VectorType::getExtendedElementVectorType(VTy)); + } + + static FixedVectorType *getTruncatedElementVectorType(FixedVectorType *VTy) { + return cast( + VectorType::getTruncatedElementVectorType(VTy)); + } + + static FixedVectorType *getSubdividedVectorType(FixedVectorType *VTy, + int NumSubdivs) { + return cast( + VectorType::getSubdividedVectorType(VTy, NumSubdivs)); + } + + static FixedVectorType *getHalfElementsVectorType(FixedVectorType *VTy) { + return cast(VectorType::getHalfElementsVectorType(VTy)); + } + + static FixedVectorType *getDoubleElementsVectorType(FixedVectorType *VTy) { + return cast(VectorType::getDoubleElementsVectorType(VTy)); + } + + static bool classof(const Type *T) { + return isa(T->LLVMTy); + } + + unsigned getNumElements() const { + return cast(LLVMTy)->getNumElements(); + } +}; + +class ScalableVectorType : public VectorType { +public: + static ScalableVectorType *get(Type *ElementType, unsigned MinNumElts); + + static ScalableVectorType *get(Type *ElementType, + const ScalableVectorType *SVTy) { + return get(ElementType, SVTy->getMinNumElements()); + } + + static ScalableVectorType *getInteger(ScalableVectorType *VTy) { + return cast(VectorType::getInteger(VTy)); + } + + static ScalableVectorType * + getExtendedElementVectorType(ScalableVectorType *VTy) { + return cast( + VectorType::getExtendedElementVectorType(VTy)); + } + + static ScalableVectorType * + getTruncatedElementVectorType(ScalableVectorType *VTy) { + return cast( + VectorType::getTruncatedElementVectorType(VTy)); + } + + static ScalableVectorType *getSubdividedVectorType(ScalableVectorType *VTy, + int NumSubdivs) { + return cast( + VectorType::getSubdividedVectorType(VTy, NumSubdivs)); + } + + static ScalableVectorType * + getHalfElementsVectorType(ScalableVectorType *VTy) { + return cast(VectorType::getHalfElementsVectorType(VTy)); + } + + static ScalableVectorType * + getDoubleElementsVectorType(ScalableVectorType *VTy) { + return cast( + VectorType::getDoubleElementsVectorType(VTy)); + } + + unsigned getMinNumElements() const { + return cast(LLVMTy)->getMinNumElements(); + } + + static bool classof(const Type *T) { + return isa(T->LLVMTy); + } +}; + class FunctionType : public Type { public: // TODO: add missing functions diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 525cc815e73ce..a595a51d7b01f 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1867,6 +1867,33 @@ class buildvector_of_opcode : GICombineRule < def buildvector_of_truncate : buildvector_of_opcode; +// narrow binop. 
+// trunc (binop X, C) --> binop (trunc X, trunc C) +class narrow_binop_opcode : GICombineRule < + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_CONSTANT $const, $imm), + (binopOpcode $binop, $x, $const):$Binop, + (G_TRUNC $root, $binop):$Trunc, + [{ return Helper.matchNarrowBinop(*${Trunc}, *${Binop}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${Trunc}, ${matchinfo}); }])>; + +def narrow_binop_add : narrow_binop_opcode; +def narrow_binop_sub : narrow_binop_opcode; +def narrow_binop_mul : narrow_binop_opcode; +def narrow_binop_and : narrow_binop_opcode; +def narrow_binop_or : narrow_binop_opcode; +def narrow_binop_xor : narrow_binop_opcode; + +// Cast of integer. +class integer_of_opcode : GICombineRule < + (defs root:$root, apint_matchinfo:$matchinfo), + (match (G_CONSTANT $int, $imm), + (castOpcode $root, $int):$Cast, + [{ return Helper.matchCastOfInteger(*${Cast}, ${matchinfo}); }]), + (apply [{ Helper.replaceInstWithConstant(*${Cast}, ${matchinfo}); }])>; + +def integer_of_truncate : integer_of_opcode; + def cast_combines: GICombineGroup<[ truncate_of_zext, truncate_of_sext, @@ -1881,7 +1908,14 @@ def cast_combines: GICombineGroup<[ anyext_of_anyext, anyext_of_zext, anyext_of_sext, - buildvector_of_truncate + buildvector_of_truncate, + narrow_binop_add, + narrow_binop_sub, + narrow_binop_mul, + narrow_binop_and, + narrow_binop_or, + narrow_binop_xor, + integer_of_truncate ]>; diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index 0a6cc5951b706..70739709a810a 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -17,9 +17,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Error.h" #include -#include #include -#include #include #include diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp index 8714fdabf6549..30557e6a2304e 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp @@ -313,3 +313,49 @@ bool CombinerHelper::matchCastOfBuildVector(const MachineInstr &CastMI, return true; } + +bool CombinerHelper::matchNarrowBinop(const MachineInstr &TruncMI, + const MachineInstr &BinopMI, + BuildFnTy &MatchInfo) { + const GTrunc *Trunc = cast(&TruncMI); + const GBinOp *BinOp = cast(&BinopMI); + + if (!MRI.hasOneNonDBGUse(BinOp->getReg(0))) + return false; + + Register Dst = Trunc->getReg(0); + LLT DstTy = MRI.getType(Dst); + + // Is narrow binop legal? 
+ if (!isLegalOrBeforeLegalizer({BinOp->getOpcode(), {DstTy}})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + auto LHS = B.buildTrunc(DstTy, BinOp->getLHSReg()); + auto RHS = B.buildTrunc(DstTy, BinOp->getRHSReg()); + B.buildInstr(BinOp->getOpcode(), {Dst}, {LHS, RHS}); + }; + + return true; +} + +bool CombinerHelper::matchCastOfInteger(const MachineInstr &CastMI, + APInt &MatchInfo) { + const GExtOrTruncOp *Cast = cast(&CastMI); + + APInt Input = getIConstantFromReg(Cast->getSrcReg(), MRI); + + LLT DstTy = MRI.getType(Cast->getReg(0)); + + if (!isConstantLegalOrBeforeLegalizer(DstTy)) + return false; + + switch (Cast->getOpcode()) { + case TargetOpcode::G_TRUNC: { + MatchInfo = Input.trunc(DstTy.getScalarSizeInBits()); + return true; + } + default: + return false; + } +} diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp index 8d20f2668de6b..1613e413712d2 100644 --- a/llvm/lib/CodeGen/InitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -152,8 +152,7 @@ bool InitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, if (Info.UsedLanes == Info.DefinedLanes) continue; - const TargetRegisterClass *TargetRegClass = - TRI->getLargestSuperClass(MRI->getRegClass(Reg)); + const TargetRegisterClass *TargetRegClass = MRI->getRegClass(Reg); LaneBitmask NeedDef = Info.UsedLanes & ~Info.DefinedLanes; @@ -172,8 +171,8 @@ bool InitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, Register LatestReg = Reg; for (auto ind : SubRegIndexNeedInsert) { Changed = true; - const TargetRegisterClass *SubRegClass = TRI->getLargestSuperClass( - TRI->getSubRegisterClass(TargetRegClass, ind)); + const TargetRegisterClass *SubRegClass = + TRI->getSubRegisterClass(TargetRegClass, ind); Register TmpInitSubReg = MRI->createVirtualRegister(SubRegClass); LLVM_DEBUG(dbgs() << "Register Class ID" << SubRegClass->getID() << "\n"); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), @@ -199,8 +198,7 @@ bool InitUndef::fixupIllOperand(MachineInstr *MI, MachineOperand &MO) { dbgs() << "Emitting PseudoInitUndef Instruction for implicit register " << printReg(MO.getReg()) << '\n'); - const TargetRegisterClass *TargetRegClass = - TRI->getLargestSuperClass(MRI->getRegClass(MO.getReg())); + const TargetRegisterClass *TargetRegClass = MRI->getRegClass(MO.getReg()); LLVM_DEBUG(dbgs() << "Register Class ID" << TargetRegClass->getID() << "\n"); Register NewReg = MRI->createVirtualRegister(TargetRegClass); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a1cb74f43e605..2fa9e46eae506 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -3496,7 +3496,6 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, if (N->getOpcode() == ISD::ADD) { Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps); - Hi = DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2)); SDValue Cmp; // Special case: X+1 has a carry out if X+1==0. This may reduce the live // range of X. We assume comparing with 0 is cheap. 
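The special case called out in the comment just above (the following hunk applies it) is easiest to see in scalar form; this is a minimal sketch with invented names, not part of this change, mirroring how the expanded double-word ADD computes its carry::

    #include <cstdint>

    struct U128 { uint64_t Lo, Hi; };

    U128 add128(U128 A, U128 B) {
      U128 R;
      R.Lo = A.Lo + B.Lo;
      // Generic unsigned carry-out test: the low half wrapped around iff
      // the sum is smaller than one of its operands.
      uint64_t Carry = R.Lo < A.Lo ? 1 : 0;
      // When B.Lo == 1, carry-out is equivalent to (R.Lo == 0); that form
      // no longer reads A.Lo and so shortens A.Lo's live range, which is
      // the observation the comment above makes.
      R.Hi = A.Hi + B.Hi + Carry;
      return R;
    }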
@@ -3521,10 +3520,12 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, Carry = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); - if (isAllOnesConstant(LoOps[1]) && isAllOnesConstant(HiOps[1])) + if (isAllOnesConstant(LoOps[1]) && isAllOnesConstant(HiOps[1])) { Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps[0], Carry); - else + } else { + Hi = DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2)); Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); + } } else { Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps); Hi = DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2)); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index b3307dc9b7730..03010c1df0014 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8616,10 +8616,7 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, // If MinMax is NaN, let's quiet it. if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS)) { - SDValue MinMaxQuiet = - DAG.getNode(ISD::FCANONICALIZE, DL, VT, MinMax, Flags); - MinMax = - DAG.getSelectCC(DL, MinMax, MinMax, MinMaxQuiet, MinMax, ISD::SETUO); + MinMax = DAG.getNode(ISD::FCANONICALIZE, DL, VT, MinMax, Flags); } // Fixup signed zero behavior. diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 2f9f4d33df017..19b3f3d6ea038 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -602,6 +602,7 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) { using llvm::orc::shared::SPSExecutorAddr; using llvm::orc::shared::SPSString; using SPSDLOpenSig = SPSExecutorAddr(SPSString, int32_t); + using SPSDLUpdateSig = int32_t(SPSExecutorAddr, int32_t); enum dlopen_mode : int32_t { ORC_RT_RTLD_LAZY = 0x1, ORC_RT_RTLD_NOW = 0x2, @@ -612,9 +613,30 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) { auto &ES = J.getExecutionSession(); auto MainSearchOrder = J.getMainJITDylib().withLinkOrderDo( [](const JITDylibSearchOrder &SO) { return SO; }); + StringRef WrapperToCall = "__orc_rt_jit_dlopen_wrapper"; + bool dlupdate = false; + if (ES.getTargetTriple().isOSBinFormatMachO()) { + if (InitializedDylib.contains(&JD)) { + WrapperToCall = "__orc_rt_jit_dlupdate_wrapper"; + dlupdate = true; + } else + InitializedDylib.insert(&JD); + } - if (auto WrapperAddr = ES.lookup( - MainSearchOrder, J.mangleAndIntern("__orc_rt_jit_dlopen_wrapper"))) { + if (auto WrapperAddr = + ES.lookup(MainSearchOrder, J.mangleAndIntern(WrapperToCall))) { + if (dlupdate) { + int32_t result; + auto E = ES.callSPSWrapper(WrapperAddr->getAddress(), + result, DSOHandles[&JD], + int32_t(ORC_RT_RTLD_LAZY)); + if (E) + return E; + else if (result) + return make_error("dlupdate failed", + inconvertibleErrorCode()); + return Error::success(); + } return ES.callSPSWrapper(WrapperAddr->getAddress(), DSOHandles[&JD], JD.getName(), int32_t(ORC_RT_RTLD_LAZY)); @@ -641,6 +663,7 @@ Error ORCPlatformSupport::deinitialize(orc::JITDylib &JD) { return make_error("dlclose failed", inconvertibleErrorCode()); DSOHandles.erase(&JD); + InitializedDylib.erase(&JD); } else return WrapperAddr.takeError(); return Error::success(); diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index a71afe1a3162f..e56d6b47799c0 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ 
-428,6 +428,7 @@ MachOPlatform::standardRuntimeUtilityAliases() { {"___orc_rt_run_program", "___orc_rt_macho_run_program"}, {"___orc_rt_jit_dlerror", "___orc_rt_macho_jit_dlerror"}, {"___orc_rt_jit_dlopen", "___orc_rt_macho_jit_dlopen"}, + {"___orc_rt_jit_dlupdate", "___orc_rt_macho_jit_dlupdate"}, {"___orc_rt_jit_dlclose", "___orc_rt_macho_jit_dlclose"}, {"___orc_rt_jit_dlsym", "___orc_rt_macho_jit_dlsym"}, {"___orc_rt_log_error", "___orc_rt_log_error_to_stderr"}}; diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp index 637af7aaa2453..00dd9c72c469c 100644 --- a/llvm/lib/IR/User.cpp +++ b/llvm/lib/IR/User.cpp @@ -113,7 +113,17 @@ MutableArrayRef User::getDescriptor() { } bool User::isDroppable() const { - return isa(this) || isa(this); + if (auto *II = dyn_cast(this)) { + switch (II->getIntrinsicID()) { + default: + return false; + case Intrinsic::assume: + case Intrinsic::pseudoprobe: + case Intrinsic::experimental_noalias_scope_decl: + return true; + } + } + return false; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 5d9a5cbd18f15..a88124dacfaef 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -38,7 +38,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/SHA1.h" diff --git a/llvm/lib/SandboxIR/CMakeLists.txt b/llvm/lib/SandboxIR/CMakeLists.txt index 2f047944e0335..03474be0c7b80 100644 --- a/llvm/lib/SandboxIR/CMakeLists.txt +++ b/llvm/lib/SandboxIR/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_component_library(LLVMSandboxIR Pass.cpp + PassManager.cpp SandboxIR.cpp Tracker.cpp Type.cpp diff --git a/llvm/lib/SandboxIR/Pass.cpp b/llvm/lib/SandboxIR/Pass.cpp index 64e1b609a9f49..c6ec1aec48b19 100644 --- a/llvm/lib/SandboxIR/Pass.cpp +++ b/llvm/lib/SandboxIR/Pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Pass.h" +#include "llvm/SandboxIR/PassManager.h" #include "llvm/Support/Debug.h" using namespace llvm::sandboxir; diff --git a/llvm/lib/SandboxIR/PassManager.cpp b/llvm/lib/SandboxIR/PassManager.cpp new file mode 100644 index 0000000000000..2dd19e74734db --- /dev/null +++ b/llvm/lib/SandboxIR/PassManager.cpp @@ -0,0 +1,28 @@ +//===- PassManager.cpp - Runs a pipeline of Sandbox IR passes -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/SandboxIR/PassManager.h" +#include "llvm/SandboxIR/SandboxIR.h" + +using namespace llvm::sandboxir; + +bool FunctionPassManager::runOnFunction(Function &F) { + bool Change = false; + for (FunctionPass *Pass : Passes) { + Change |= Pass->runOnFunction(F); + // TODO: run the verifier. + } + // TODO: Check ChangeAll against hashes before/after. 
+ return Change; +} +#ifndef NDEBUG +void PassRegistry::dump() const { + print(dbgs()); + dbgs() << "\n"; +} +#endif // NDEBUG diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 07472d1bff47b..18fdcda15a1a9 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2489,6 +2489,32 @@ PoisonValue *PoisonValue::getElementValue(unsigned Idx) const { cast(Val)->getElementValue(Idx))); } +BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) { + auto *LLVMC = llvm::BlockAddress::get(cast(F->Val), + cast(BB->Val)); + return cast(F->getContext().getOrCreateConstant(LLVMC)); +} + +BlockAddress *BlockAddress::get(BasicBlock *BB) { + auto *LLVMC = llvm::BlockAddress::get(cast(BB->Val)); + return cast(BB->getContext().getOrCreateConstant(LLVMC)); +} + +BlockAddress *BlockAddress::lookup(const BasicBlock *BB) { + auto *LLVMC = llvm::BlockAddress::lookup(cast(BB->Val)); + return cast_or_null(BB->getContext().getValue(LLVMC)); +} + +Function *BlockAddress::getFunction() const { + return cast( + Ctx.getValue(cast(Val)->getFunction())); +} + +BasicBlock *BlockAddress::getBasicBlock() const { + return cast( + Ctx.getValue(cast(Val)->getBasicBlock())); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2585,6 +2611,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr( new ConstantFP(cast(C), *this)); return It->second.get(); + case llvm::Value::BlockAddressVal: + It->second = std::unique_ptr( + new BlockAddress(cast(C), *this)); + return It->second.get(); case llvm::Value::ConstantAggregateZeroVal: { auto *CAZ = cast(C); It->second = std::unique_ptr( @@ -2640,7 +2670,7 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { return It->second.get(); } if (auto *BB = dyn_cast(LLVMV)) { - assert(isa(U) && + assert(isa(U) && "This won't create a SBBB, don't call this function directly!"); if (auto *SBBB = getValue(BB)) return SBBB; @@ -3173,7 +3203,7 @@ void BasicBlock::buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB) { Ctx.getOrCreateValue(Op); } } -#if !defined(NDEBUG) && defined(SBVEC_EXPENSIVE_CHECKS) +#if !defined(NDEBUG) verify(); #endif } @@ -3249,4 +3279,12 @@ void BasicBlock::dumpOS(raw_ostream &OS) const { } } } + +void BasicBlock::verify() const { + assert(isa(Val) && "Expected BasicBlock!"); + for (const auto &I : *this) { + I.verify(); + } +} + #endif // NDEBUG diff --git a/llvm/lib/SandboxIR/Type.cpp b/llvm/lib/SandboxIR/Type.cpp index bf9f02e2ba311..87dcb726dde35 100644 --- a/llvm/lib/SandboxIR/Type.cpp +++ b/llvm/lib/SandboxIR/Type.cpp @@ -103,6 +103,17 @@ bool VectorType::isValidElementType(Type *ElemTy) { return llvm::VectorType::isValidElementType(ElemTy->LLVMTy); } +FixedVectorType *FixedVectorType::get(Type *ElementType, unsigned NumElts) { + return cast(ElementType->getContext().getType( + llvm::FixedVectorType::get(ElementType->LLVMTy, NumElts))); +} + +ScalableVectorType *ScalableVectorType::get(Type *ElementType, + unsigned NumElts) { + return cast(ElementType->getContext().getType( + llvm::ScalableVectorType::get(ElementType->LLVMTy, NumElts))); +} + IntegerType *IntegerType::get(Context &Ctx, unsigned NumBits) { return cast( Ctx.getType(llvm::IntegerType::get(Ctx.LLVMCtx, NumBits))); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 8d6e022e1e4d4..399aa9c633564 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ 
b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -36,7 +36,7 @@ void initializeAMDGPURegBankSelectPass(PassRegistry &); FunctionPass *createGCNDPPCombinePass(); FunctionPass *createSIAnnotateControlFlowLegacyPass(); FunctionPass *createSIFoldOperandsLegacyPass(); -FunctionPass *createSIPeepholeSDWAPass(); +FunctionPass *createSIPeepholeSDWALegacyPass(); FunctionPass *createSILowerI1CopiesLegacyPass(); FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass(); FunctionPass *createSIShrinkInstructionsLegacyPass(); @@ -163,8 +163,8 @@ extern char &GCNDPPCombineLegacyID; void initializeSIFoldOperandsLegacyPass(PassRegistry &); extern char &SIFoldOperandsLegacyID; -void initializeSIPeepholeSDWAPass(PassRegistry &); -extern char &SIPeepholeSDWAID; +void initializeSIPeepholeSDWALegacyPass(PassRegistry &); +extern char &SIPeepholeSDWALegacyID; void initializeSIShrinkInstructionsLegacyPass(PassRegistry &); extern char &SIShrinkInstructionsLegacyID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 58481fe9df239..97661bf9837f9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -100,5 +100,6 @@ MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) +MACHINE_FUNCTION_PASS("si-peephole-sdwa", SIPeepholeSDWAPass()) MACHINE_FUNCTION_PASS("si-shrink-instructions", SIShrinkInstructionsPass()) #undef MACHINE_FUNCTION_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9c9c505139373..55d0de59bc49a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -39,6 +39,7 @@ #include "SILoadStoreOptimizer.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" +#include "SIPeepholeSDWA.h" #include "SIShrinkInstructions.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -415,7 +416,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIFixSGPRCopiesLegacyPass(*PR); initializeSIFixVGPRCopiesPass(*PR); initializeSIFoldOperandsLegacyPass(*PR); - initializeSIPeepholeSDWAPass(*PR); + initializeSIPeepholeSDWALegacyPass(*PR); initializeSIShrinkInstructionsLegacyPass(*PR); initializeSIOptimizeExecMaskingPreRAPass(*PR); initializeSIOptimizeVGPRLiveRangePass(*PR); @@ -1275,7 +1276,7 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&GCNDPPCombineLegacyID); addPass(&SILoadStoreOptimizerLegacyID); if (isPassEnabled(EnableSDWAPeephole)) { - addPass(&SIPeepholeSDWAID); + addPass(&SIPeepholeSDWALegacyID); addPass(&EarlyMachineLICMID); addPass(&MachineCSELegacyID); addPass(&SIFoldOperandsLegacyID); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index c6668b24f4ef6..532ece8b16c5e 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1583,9 +1583,8 @@ multiclass BufferAtomicPat; } -multiclass BufferAtomicIntrPat { - defm : BufferAtomicPat; -} +multiclass BufferAtomicIntrPat : + BufferAtomicPat; multiclass BufferAtomicCmpSwapPat_Common { foreach RtnMode = ["ret", "noret"] in { diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 
44872761760db..434336ef137ff 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -1116,8 +1116,8 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { Register SrcReg = MI.getOperand(1).getReg(); Register DstReg = MI.getOperand(0).getReg(); if (SrcReg == AMDGPU::SCC) { - Register SCCCopy = MRI->createVirtualRegister( - TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); + Register SCCCopy = + MRI->createVirtualRegister(TRI->getWaveMaskRegClass()); I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)), MI.getDebugLoc(), TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32 diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 736f714ac1a77..bbb1d0c5eba14 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4562,7 +4562,7 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); Register DstReg = MI.getOperand(0).getReg(); Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); @@ -5064,7 +5064,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return BB; } - const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *CarryRC = TRI->getWaveMaskRegClass(); Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -5296,7 +5296,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *CondRC = TRI->getWaveMaskRegClass(); Register SrcCondCopy = MRI.createVirtualRegister(CondRC); const TargetRegisterClass *Src0RC = Src0.isReg() diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c6f28af1e5e73..87b213767b4fc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1231,8 +1231,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, Register TrueReg, Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *BoolXExecRC = - RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass(); assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && "Not a VGPR32 reg"); @@ -6417,7 +6416,7 @@ static void emitLoadScalarOpsFromVGPRLoop( ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; unsigned AndOpc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); MachineBasicBlock::iterator I = LoopBB.begin(); @@ -6565,7 +6564,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; unsigned MovExecOpc = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); // Save SCC. Waterfall Loop may overwrite SCC. Register SaveSCCReg; @@ -6958,7 +6957,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *BoolXExecRC = RI.getWaveMaskRegClass(); Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); @@ -7336,7 +7335,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; - const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *CarryRC = RI.getWaveMaskRegClass(); Register CarryInReg = Inst.getOperand(4).getReg(); if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { @@ -7711,8 +7710,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, Register NewCondReg = CondReg; if (IsSCC) { - const TargetRegisterClass *TC = - RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const TargetRegisterClass *TC = RI.getWaveMaskRegClass(); NewCondReg = MRI.createVirtualRegister(TC); // Now look for the closest SCC def if it is a copy diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 1b52a48d068eb..23d04fae42015 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2014,7 +2014,7 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, MachineOperand OffsetHi = createRegOrImm(static_cast(Addr.Offset >> 32), MI); - const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *CarryRC = TRI->getWaveMaskRegClass(); Register CarryReg = MRI->createVirtualRegister(CarryRC); Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 7af5e7388f841..4cc60f5097899 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -100,19 +100,25 @@ struct SIArgument { SIArgument() : IsRegister(false), StackOffset(0) {} SIArgument(const SIArgument &Other) { IsRegister = Other.IsRegister; - if (IsRegister) { - ::new ((void *)std::addressof(RegisterName)) - StringValue(Other.RegisterName); - } else + if (IsRegister) + new (&RegisterName) StringValue(Other.RegisterName); + else StackOffset = Other.StackOffset; Mask = Other.Mask; } SIArgument &operator=(const SIArgument &Other) { + // Default-construct or destruct the old RegisterName in case of switching + // union members + if (IsRegister != Other.IsRegister) { + if (Other.IsRegister) + new (&RegisterName) StringValue(); + else + RegisterName.~StringValue(); + } IsRegister = Other.IsRegister; - if (IsRegister) { - ::new ((void *)std::addressof(RegisterName)) - StringValue(Other.RegisterName); - } else + if (IsRegister) + RegisterName = Other.RegisterName; + else StackOffset = Other.StackOffset; Mask = Other.Mask; return *this; diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index d80e1277b2a8a..86cb0e6944ed7 100644 --- 
a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -19,6 +19,7 @@ /// //===----------------------------------------------------------------------===// +#include "SIPeepholeSDWA.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -45,7 +46,7 @@ class SDWADstOperand; using SDWAOperandsVector = SmallVector; using SDWAOperandsMap = MapVector; -class SIPeepholeSDWA : public MachineFunctionPass { +class SIPeepholeSDWA { private: MachineRegisterInfo *MRI; const SIRegisterInfo *TRI; @@ -57,14 +58,6 @@ class SIPeepholeSDWA : public MachineFunctionPass { std::optional foldToImm(const MachineOperand &Op) const; -public: - static char ID; - - SIPeepholeSDWA() : MachineFunctionPass(ID) { - initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr matchSDWAOperand(MachineInstr &MI); void pseudoOpConvertToVOP2(MachineInstr &MI, @@ -72,8 +65,20 @@ class SIPeepholeSDWA : public MachineFunctionPass { bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; +public: + bool run(MachineFunction &MF); +}; + +class SIPeepholeSDWALegacy : public MachineFunctionPass { +public: + static char ID; + + SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {} + StringRef getPassName() const override { return "SI Peephole SDWA"; } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); @@ -192,17 +197,17 @@ class SDWADstPreserveOperand : public SDWADstOperand { } // end anonymous namespace -INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false) +INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false, + false) -char SIPeepholeSDWA::ID = 0; +char SIPeepholeSDWALegacy::ID = 0; -char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; +char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID; -FunctionPass *llvm::createSIPeepholeSDWAPass() { - return new SIPeepholeSDWA(); +FunctionPass *llvm::createSIPeepholeSDWALegacyPass() { + return new SIPeepholeSDWALegacy(); } - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { switch(Sel) { @@ -1235,10 +1240,17 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, } } -bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { +bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + return SIPeepholeSDWA().run(MF); +} + +bool SIPeepholeSDWA::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); - if (!ST.hasSDWA() || skipFunction(MF.getFunction())) + if (!ST.hasSDWA()) return false; MRI = &MF.getRegInfo(); @@ -1295,3 +1307,13 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { return Ret; } + +PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &) { + if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h new file mode 100644 index 
0000000000000..217867220f7d8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h @@ -0,0 +1,24 @@ +//===--------- SIPeepholeSDWA.h -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIPEEPHOLESDWA_H +#define LLVM_LIB_TARGET_AMDGPU_SIPEEPHOLESDWA_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class SIPeepholeSDWAPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_SIPEEPHOLESDWA_H diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 4c571a36e4896..2d1cd1bda3afe 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3428,8 +3428,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size)); case AMDGPU::VCCRegBankID: assert(Size == 1); - return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass - : &AMDGPU::SReg_64_XEXECRegClass; + return getWaveMaskRegClass(); case AMDGPU::SGPRRegBankID: return getSGPRClassForBitWidth(std::max(32u, Size)); case AMDGPU::AGPRRegBankID: @@ -3472,8 +3471,7 @@ SIRegisterInfo::getRegClass(unsigned RCID) const { case AMDGPU::SReg_1RegClassID: return getBoolRC(); case AMDGPU::SReg_1_XEXECRegClassID: - return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass - : &AMDGPU::SReg_64_XEXECRegClass; + return getWaveMaskRegClass(); case -1: return nullptr; default: diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index f9d7ead4ff3ec..38ebda6cde1e5 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1527,13 +1527,18 @@ bool SIWholeQuadMode::lowerCopyInstrs() { for (MachineInstr *MI : LowerToCopyInstrs) { LLVM_DEBUG(dbgs() << "simplify: " << *MI); - Register RecomputeReg = 0; if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { assert(MI->getNumExplicitOperands() == 3); + + LiveInterval *RecomputeLI = nullptr; if (MI->getOperand(2).isReg()) - RecomputeReg = MI->getOperand(2).getReg(); + RecomputeLI = &LIS->getInterval(MI->getOperand(2).getReg()); + MI->removeOperand(2); + + if (RecomputeLI) + LIS->shrinkToUses(RecomputeLI); } else { assert(MI->getNumExplicitOperands() == 2); } @@ -1550,11 +1555,6 @@ bool SIWholeQuadMode::lowerCopyInstrs() { MI->setDesc(TII->get(CopyOp)); LLVM_DEBUG(dbgs() << " -> " << *MI); - - if (RecomputeReg) { - LIS->removeInterval(RecomputeReg); - LIS->createAndComputeVirtRegInterval(RecomputeReg); - } } return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty(); } diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 03e4cb9fcf49b..8e5b61e8e492e 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -882,9 +882,8 @@ multiclass VOP1_Real_dpp8_with_name op, string opName, } } -multiclass VOP1_Realtriple_e64 op> { - defm NAME : VOP3_Realtriple; -} +multiclass VOP1_Realtriple_e64 op> : + VOP3_Realtriple; multiclass VOP1_Realtriple_e64_with_name op, string opName, 
string asmName> { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index fccaa27f36138..afae7a886288c 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1513,9 +1513,8 @@ multiclass VOP2be_Real_dpp8 op, string opName, string asmNam } // We don't want to override separate decoderNamespaces within these -multiclass VOP2_Realtriple_e64 op> { - defm NAME : VOP3_Realtriple ; -} +multiclass VOP2_Realtriple_e64 op> : + VOP3_Realtriple; multiclass VOP2_Realtriple_e64_with_name op, string opName, string asmName> { diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 53803cff8b90a..58b5e98fd30b1 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -241,19 +241,6 @@ class ARMBaseRegisterInfo : public ARMGenRegisterInfo { int getSEHRegNum(unsigned i) const { return getEncodingValue(i); } - const TargetRegisterClass * - getLargestSuperClass(const TargetRegisterClass *RC) const override { - if (ARM::MQPRRegClass.hasSubClassEq(RC)) - return &ARM::MQPRRegClass; - if (ARM::SPRRegClass.hasSubClassEq(RC)) - return &ARM::SPRRegClass; - if (ARM::DPR_VFP2RegClass.hasSubClassEq(RC)) - return &ARM::DPR_VFP2RegClass; - if (ARM::GPRRegClass.hasSubClassEq(RC)) - return &ARM::GPRRegClass; - return RC; - } - bool doesRegClassHavePseudoInitUndef( const TargetRegisterClass *RC) const override { (void)RC; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index cd0d6d34e9a67..45aadac861946 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1345,7 +1345,7 @@ void DXILBitcodeWriter::writeValueAsMetadata( Ty = TypedPointerType::get(F->getFunctionType(), F->getAddressSpace()); else if (GlobalVariable *GV = dyn_cast(V)) Ty = TypedPointerType::get(GV->getValueType(), GV->getAddressSpace()); - Record.push_back(getTypeID(Ty)); + Record.push_back(getTypeID(Ty, V)); Record.push_back(VE.getValueID(V)); Stream.EmitRecord(bitc::METADATA_VALUE, Record, 0); Record.clear(); diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td index 91ffbc4eb77dd..27b9ce60ba826 100644 --- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td @@ -1139,6 +1139,43 @@ let AdditionalPredicates = [NotInMicroMips] in { ISA_MIPS32R6; } +// llvm.is_fpclass operations. 
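The hunk below lowers `llvm.is_fpclass` for MIPS R6: `class.s`/`class.d` return a bitmask describing the operand's floating-point class, so the intrinsic reduces to translating LLVM's `fc*` test bits into the CLASS mask layout and checking the AND result for non-zero (the `SLTu ZERO, (ANDi ...)` in the patterns). As an illustrative aside, not part of the patch, here is a minimal C++ model of the intrinsic's semantics; the `fc*` bit values are assumed to mirror LLVM's `FPClassTest` ordering, and the signaling/quiet NaN distinction is elided:

```cpp
#include <cmath>
#include <cstdint>

// Assumed bit assignments, mirroring the fc* order used by the patch below.
enum : uint32_t {
  fcSNan = 1u << 0, fcQNan = 1u << 1,
  fcNegInf = 1u << 2, fcNegNormal = 1u << 3,
  fcNegSubnormal = 1u << 4, fcNegZero = 1u << 5,
  fcPosZero = 1u << 6, fcPosSubnormal = 1u << 7,
  fcPosNormal = 1u << 8, fcPosInf = 1u << 9,
};

// What the lowered sequence computes: classify X into exactly one class bit,
// then report whether that bit is among the requested ones.
bool isFPClass(double X, uint32_t Mask) {
  bool Neg = std::signbit(X);
  uint32_t Class;
  switch (std::fpclassify(X)) {
  case FP_NAN:       Class = fcQNan; break; // fcSNan not distinguished here
  case FP_INFINITE:  Class = Neg ? fcNegInf : fcPosInf; break;
  case FP_NORMAL:    Class = Neg ? fcNegNormal : fcPosNormal; break;
  case FP_SUBNORMAL: Class = Neg ? fcNegSubnormal : fcPosSubnormal; break;
  default:           Class = Neg ? fcNegZero : fcPosZero; break; // FP_ZERO
  }
  return (Class & Mask) != 0;
}
```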
+def to_fclass_mask: SDNodeXForm<timm, [{
+  uint64_t Check = N->getZExtValue();
+  unsigned Mask = 0;
+  if (Check & fcSNan)
+    Mask |= Mips::FClassMaskSignalingNaN;
+  if (Check & fcQNan)
+    Mask |= Mips::FClassMaskQuietNaN;
+  if (Check & fcPosInf)
+    Mask |= Mips::FClassMaskPositiveInfinity;
+  if (Check & fcNegInf)
+    Mask |= Mips::FClassMaskNegativeInfinity;
+  if (Check & fcPosNormal)
+    Mask |= Mips::FClassMaskPositiveNormal;
+  if (Check & fcNegNormal)
+    Mask |= Mips::FClassMaskNegativeNormal;
+  if (Check & fcPosSubnormal)
+    Mask |= Mips::FClassMaskPositiveSubnormal;
+  if (Check & fcNegSubnormal)
+    Mask |= Mips::FClassMaskNegativeSubnormal;
+  if (Check & fcPosZero)
+    Mask |= Mips::FClassMaskPositiveZero;
+  if (Check & fcNegZero)
+    Mask |= Mips::FClassMaskNegativeZero;
+  return CurDAG->getTargetConstant(Mask, SDLoc(N), MVT::i32);
+}]>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(is_fpclass f32:$lhs, i32:$imm),
+                (SLTu ZERO, (ANDi (MFC1 (CLASS_S f32:$lhs)),
+                                  (to_fclass_mask imm:$imm)))>,
+        ISA_MIPS32R6;
+  def : MipsPat<(is_fpclass f64:$lhs, i32:$imm),
+                (SLTu ZERO, (ANDi (MFC1_D64 (CLASS_D f64:$lhs)),
+                                  (to_fclass_mask imm:$imm)))>,
+        ISA_MIPS32R6;
+}
+
 // Pseudo instructions
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
     hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT], hasPostISelHook = 1 in {
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index fa57a3fa9b155..59f78a8ca306c 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -359,8 +359,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 
-  // Lower fmin and fmax operations for MIPS R6.
-  // Instructions are defined but never used.
+  // Lower fmin/fmax/fclass operations for MIPS R6.
   if (Subtarget.hasMips32r6()) {
     setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
     setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
@@ -370,6 +369,8 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f64, Expand);
     setOperationAction(ISD::FMAXNUM, MVT::f64, Expand);
+    setOperationAction(ISD::IS_FPCLASS, MVT::f32, Legal);
+    setOperationAction(ISD::IS_FPCLASS, MVT::f64, Legal);
   } else {
     setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h
index dc4b9d99b39d2..4e039e0e32aba 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.h
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.h
@@ -213,6 +213,23 @@ class MipsInstrInfo : public MipsGenInstrInfo {
 const MipsInstrInfo *createMips16InstrInfo(const MipsSubtarget &STI);
 const MipsInstrInfo *createMipsSEInstrInfo(const MipsSubtarget &STI);
 
+namespace Mips {
+// Mask assignments for floating-point.
+enum FClassMask { + FClassMaskSignalingNaN = 1 << 0, + FClassMaskQuietNaN = 1 << 1, + FClassMaskNegativeInfinity = 1 << 2, + FClassMaskNegativeNormal = 1 << 3, + FClassMaskNegativeSubnormal = 1 << 4, + FClassMaskNegativeZero = 1 << 5, + FClassMaskPositiveInfinity = 1 << 6, + FClassMaskPositiveNormal = 1 << 7, + FClassMaskPositiveSubnormal = 1 << 8, + FClassMaskPositiveZero = 1 << 9 +}; + +} // namespace Mips + } // end namespace llvm #endif // LLVM_LIB_TARGET_MIPS_MIPSINSTRINFO_H diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 325a50c9f48a1..13212c2aea5dd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -169,7 +169,11 @@ Register RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI, bool RISCVInstrInfo::isReallyTriviallyReMaterializable( const MachineInstr &MI) const { switch (RISCV::getRVVMCOpcode(MI.getOpcode())) { + case RISCV::VMV_V_X: + case RISCV::VFMV_V_F: case RISCV::VMV_V_I: + case RISCV::VMV_S_X: + case RISCV::VFMV_S_F: case RISCV::VID_V: if (MI.getOperand(1).isUndef() && /* After RISCVInsertVSETVLI most pseudos will have implicit uses on vl diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index e11f176bfe604..430e09fd834ba 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -2475,6 +2475,7 @@ multiclass VPseudoUnaryVMV_V_X_I { def "_V_" # mx : VPseudoUnaryNoMask, SchedUnary<"WriteVIMovV", "ReadVIMovV", mx, forcePassthruRead=true>; + let isReMaterializable = 1 in def "_X_" # mx : VPseudoUnaryNoMask, SchedUnary<"WriteVIMovX", "ReadVIMovX", mx, forcePassthruRead=true>; @@ -6557,6 +6558,7 @@ defm PseudoVFMERGE : VPseudoVMRG_FM; //===----------------------------------------------------------------------===// // 13.16. 
Vector Floating-Point Move Instruction //===----------------------------------------------------------------------===// +let isReMaterializable = 1 in defm PseudoVFMV_V : VPseudoVMV_F; //===----------------------------------------------------------------------===// @@ -6762,7 +6764,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { Pseudo<(outs GPR:$rd), (ins VR:$rs2, ixlenimm:$sew), []>, Sched<[WriteVMovXS, ReadVMovXS]>, RISCVVPseudo; - let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, + let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1, Constraints = "$rd = $rs1" in def PseudoVMV_S_X: Pseudo<(outs VR:$rd), (ins VR:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew), @@ -6785,7 +6787,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { (ins VR:$rs2, ixlenimm:$sew), []>, Sched<[WriteVMovFS, ReadVMovFS]>, RISCVVPseudo; - let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, + let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1, Constraints = "$rd = $rs1" in def "PseudoVFMV_S_" # f.FX : Pseudo<(outs VR:$rd), diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 7e04e9154b524..98a712af08539 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -130,19 +130,6 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override; - const TargetRegisterClass * - getLargestSuperClass(const TargetRegisterClass *RC) const override { - if (RISCV::VRM8RegClass.hasSubClassEq(RC)) - return &RISCV::VRM8RegClass; - if (RISCV::VRM4RegClass.hasSubClassEq(RC)) - return &RISCV::VRM4RegClass; - if (RISCV::VRM2RegClass.hasSubClassEq(RC)) - return &RISCV::VRM2RegClass; - if (RISCV::VRRegClass.hasSubClassEq(RC)) - return &RISCV::VRRegClass; - return RC; - } - bool doesRegClassHavePseudoInitUndef( const TargetRegisterClass *RC) const override { return isVRRegClass(RC); diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 298f3317bf61a..026e5d653c38c 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -145,6 +145,24 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { case RISCV::VMERGE_VVM: SrcIdx = 3; // TODO: We can also handle the false operand. 
break; + case RISCV::VREDSUM_VS: + case RISCV::VREDMAXU_VS: + case RISCV::VREDMAX_VS: + case RISCV::VREDMINU_VS: + case RISCV::VREDMIN_VS: + case RISCV::VREDAND_VS: + case RISCV::VREDOR_VS: + case RISCV::VREDXOR_VS: + case RISCV::VWREDSUM_VS: + case RISCV::VWREDSUMU_VS: + case RISCV::VFREDUSUM_VS: + case RISCV::VFREDOSUM_VS: + case RISCV::VFREDMAX_VS: + case RISCV::VFREDMIN_VS: + case RISCV::VFWREDUSUM_VS: + case RISCV::VFWREDOSUM_VS: + SrcIdx = 2; + break; } MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc())); diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp index 7c32bb1968ef5..832ca0ba5a82d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp @@ -13,6 +13,8 @@ #include "SPIRVDuplicatesTracker.h" +#define DEBUG_TYPE "build-dep-graph" + using namespace llvm; template @@ -63,6 +65,18 @@ void SPIRVGeneralDuplicatesTracker::buildDepsGraph( if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL && i == 2) continue; MachineOperand *RegOp = &VRegDef->getOperand(0); + LLVM_DEBUG({ + if (Reg2Entry.count(RegOp) == 0 && + (MI->getOpcode() != SPIRV::OpVariable || i != 3)) { + dbgs() << "Unexpected pattern while building a dependency " + "graph.\nInstruction: "; + MI->print(dbgs()); + dbgs() << "Operand: "; + Op.print(dbgs()); + dbgs() << "\nOperand definition: "; + VRegDef->print(dbgs()); + } + }); assert((MI->getOpcode() == SPIRV::OpVariable && i == 3) || Reg2Entry.count(RegOp)); if (Reg2Entry.count(RegOp)) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 1e861da35aaac..831d7f76ac14c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -607,10 +607,7 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, case TargetOpcode::G_ADDRSPACE_CAST: return selectAddrSpaceCast(ResVReg, ResType, I); case TargetOpcode::G_PTR_ADD: { - // Currently, we get G_PTR_ADD only as a result of translating - // global variables, initialized with constant expressions like GV + Const - // (see test opencl/basic/progvar_prog_scope_init.ll). - // TODO: extend the handler once we have other cases. + // Currently, we get G_PTR_ADD only applied to global variables. assert(I.getOperand(1).isReg() && I.getOperand(2).isReg()); Register GV = I.getOperand(1).getReg(); MachineRegisterInfo::def_instr_iterator II = MRI->def_instr_begin(GV); @@ -619,8 +616,68 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, (*II).getOpcode() == TargetOpcode::COPY || (*II).getOpcode() == SPIRV::OpVariable) && isImm(I.getOperand(2), MRI)); - Register Idx = buildZerosVal(GR.getOrCreateSPIRVIntegerType(32, I, TII), I); + // It may be the initialization of a global variable. 
+  bool IsGVInit = false;
+  for (MachineRegisterInfo::use_instr_iterator
+           UseIt = MRI->use_instr_begin(I.getOperand(0).getReg()),
+           UseEnd = MRI->use_instr_end();
+       UseIt != UseEnd; UseIt = std::next(UseIt)) {
+    if ((*UseIt).getOpcode() == TargetOpcode::G_GLOBAL_VALUE ||
+        (*UseIt).getOpcode() == SPIRV::OpVariable) {
+      IsGVInit = true;
+      break;
+    }
+  }
   MachineBasicBlock &BB = *I.getParent();
+  if (!IsGVInit) {
+    SPIRVType *GVType = GR.getSPIRVTypeForVReg(GV);
+    SPIRVType *GVPointeeType = GR.getPointeeType(GVType);
+    SPIRVType *ResPointeeType = GR.getPointeeType(ResType);
+    if (GVPointeeType && ResPointeeType && GVPointeeType != ResPointeeType) {
+      // Build a new virtual register that is associated with the required
+      // data type.
+      Register NewVReg = MRI->createGenericVirtualRegister(MRI->getType(GV));
+      MRI->setRegClass(NewVReg, MRI->getRegClass(GV));
+      // Having a correctly typed base, we are ready to build the actually
+      // required GEP. It may not be a constant though, because all operands
+      // of OpSpecConstantOp must originate from other constant instructions,
+      // and only the AccessChain named opcodes accept a global OpVariable
+      // instruction. We can't use an AccessChain opcode because of the type
+      // mismatch between result and base types.
+      if (!GR.isBitcastCompatible(ResType, GVType))
+        report_fatal_error(
+            "incompatible result and operand types in a bitcast");
+      Register ResTypeReg = GR.getSPIRVTypeID(ResType);
+      MachineInstrBuilder MIB =
+          BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpBitcast))
+              .addDef(NewVReg)
+              .addUse(ResTypeReg)
+              .addUse(GV);
+      return MIB.constrainAllUses(TII, TRI, RBI) &&
+             BuildMI(BB, I, I.getDebugLoc(),
+                     TII.get(STI.isVulkanEnv()
+                                 ? SPIRV::OpInBoundsAccessChain
+                                 : SPIRV::OpInBoundsPtrAccessChain))
+                 .addDef(ResVReg)
+                 .addUse(ResTypeReg)
+                 .addUse(NewVReg)
+                 .addUse(I.getOperand(2).getReg())
+                 .constrainAllUses(TII, TRI, RBI);
+    } else {
+      return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp))
+          .addDef(ResVReg)
+          .addUse(GR.getSPIRVTypeID(ResType))
+          .addImm(
+              static_cast<uint32_t>(SPIRV::Opcode::InBoundsPtrAccessChain))
+          .addUse(GV)
+          .addUse(I.getOperand(2).getReg())
+          .constrainAllUses(TII, TRI, RBI);
+    }
+  }
+  // It's possible to translate G_PTR_ADD to OpSpecConstantOp: either to
+  // initialize a global variable with a constant expression (e.g., the test
+  // case opencl/basic/progvar_prog_scope_init.ll), or for another use case.
+  Register Idx = buildZerosVal(GR.getOrCreateSPIRVIntegerType(32, I, TII), I);
   auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp))
                  .addDef(ResVReg)
                  .addUse(GR.getSPIRVTypeID(ResType))
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 96601dd8796c6..23cd32eff45d5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -1628,6 +1628,7 @@ multiclass OpcodeOperand<bits<32> value> {
   defm : SymbolicOperandWithRequirements;
 }
 // TODO: implement other mnemonics.
+defm InBoundsAccessChain : OpcodeOperand<66>; defm InBoundsPtrAccessChain : OpcodeOperand<70>; defm PtrCastToGeneric : OpcodeOperand<121>; defm Bitcast : OpcodeOperand<124>; diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 24a9ad67cfe04..5299e6ea06f0b 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -45,7 +45,7 @@ namespace { /// WebAssemblyOperand - Instances of this class represent the operands in a /// parsed Wasm machine instruction. struct WebAssemblyOperand : public MCParsedAsmOperand { - enum KindTy { Token, Integer, Float, Symbol, BrList } Kind; + enum KindTy { Token, Integer, Float, Symbol, BrList, CatchList } Kind; SMLoc StartLoc, EndLoc; @@ -99,6 +99,7 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { bool isMem() const override { return false; } bool isReg() const override { return false; } bool isBrList() const { return Kind == BrList; } + bool isCatchList() const { return Kind == CatchList; } MCRegister getReg() const override { llvm_unreachable("Assembly inspects a register operand"); @@ -151,6 +152,10 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createImm(Br)); } + void addCatchListOperands(MCInst &Inst, unsigned N) const { + // TODO + } + void print(raw_ostream &OS) const override { switch (Kind) { case Token: @@ -168,6 +173,9 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { case BrList: OS << "BrList:" << BrL.List.size(); break; + case CatchList: + // TODO + break; } } }; diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index 9f9e7d1c0ed06..ec3d51d4e0e84 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -112,9 +112,9 @@ bool WebAssemblyAsmTypeCheck::popRefType(SMLoc ErrorLoc) { return false; } -bool WebAssemblyAsmTypeCheck::getLocal(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getLocal(SMLoc ErrorLoc, const MCOperand &LocalOp, wasm::ValType &Type) { - auto Local = static_cast(Inst.getOperand(0).getImm()); + auto Local = static_cast(LocalOp.getImm()); if (Local >= LocalTypes.size()) return typeError(ErrorLoc, StringRef("no local type specified for index ") + std::to_string(Local)); @@ -178,21 +178,21 @@ bool WebAssemblyAsmTypeCheck::checkSig(SMLoc ErrorLoc, return false; } -bool WebAssemblyAsmTypeCheck::getSymRef(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getSymRef(SMLoc ErrorLoc, const MCOperand &SymOp, const MCSymbolRefExpr *&SymRef) { - auto Op = Inst.getOperand(0); - if (!Op.isExpr()) + if (!SymOp.isExpr()) return typeError(ErrorLoc, StringRef("expected expression operand")); - SymRef = dyn_cast(Op.getExpr()); + SymRef = dyn_cast(SymOp.getExpr()); if (!SymRef) return typeError(ErrorLoc, StringRef("expected symbol operand")); return false; } -bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, + const MCOperand &GlobalOp, wasm::ValType &Type) { const MCSymbolRefExpr *SymRef; - if (getSymRef(ErrorLoc, Inst, SymRef)) + if (getSymRef(ErrorLoc, GlobalOp, SymRef)) return true; auto WasmSym = cast(&SymRef->getSymbol()); switch (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA)) { @@ -217,10 
+217,10 @@ bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst, return false; } -bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCOperand &TableOp, wasm::ValType &Type) { const MCSymbolRefExpr *SymRef; - if (getSymRef(ErrorLoc, Inst, SymRef)) + if (getSymRef(ErrorLoc, TableOp, SymRef)) return true; auto WasmSym = cast(&SymRef->getSymbol()); if (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA) != @@ -231,6 +231,34 @@ bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCInst &Inst, return false; } +bool WebAssemblyAsmTypeCheck::getSignature(SMLoc ErrorLoc, + const MCOperand &SigOp, + wasm::WasmSymbolType Type, + const wasm::WasmSignature *&Sig) { + const MCSymbolRefExpr *SymRef = nullptr; + if (getSymRef(ErrorLoc, SigOp, SymRef)) + return true; + const auto *WasmSym = cast(&SymRef->getSymbol()); + Sig = WasmSym->getSignature(); + + if (!Sig || WasmSym->getType() != Type) { + const char *TypeName = nullptr; + switch (Type) { + case wasm::WASM_SYMBOL_TYPE_FUNCTION: + TypeName = "func"; + break; + case wasm::WASM_SYMBOL_TYPE_TAG: + TypeName = "tag"; + break; + default: + return true; + } + return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() + + ": missing ." + TypeName + "type"); + } + return false; +} + bool WebAssemblyAsmTypeCheck::endOfFunction(SMLoc ErrorLoc) { // Check the return types. for (auto RVT : llvm::reverse(ReturnTypes)) { @@ -252,48 +280,48 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, dumpTypeStack("typechecking " + Name + ": "); wasm::ValType Type; if (Name == "local.get") { - if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; Stack.push_back(Type); } else if (Name == "local.set") { - if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; } else if (Name == "local.tee") { - if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; Stack.push_back(Type); } else if (Name == "global.get") { - if (getGlobal(Operands[1]->getStartLoc(), Inst, Type)) + if (getGlobal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; Stack.push_back(Type); } else if (Name == "global.set") { - if (getGlobal(Operands[1]->getStartLoc(), Inst, Type)) + if (getGlobal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; } else if (Name == "table.get") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; Stack.push_back(Type); } else if (Name == "table.set") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; } else if (Name == "table.size") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; Stack.push_back(wasm::ValType::I32); } else if (Name == "table.grow") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if 
(getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; @@ -301,7 +329,7 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, return true; Stack.push_back(wasm::ValType::I32); } else if (Name == "table.fill") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; @@ -352,15 +380,10 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, return true; Unreachable = false; if (Name == "catch") { - const MCSymbolRefExpr *SymRef; - if (getSymRef(Operands[1]->getStartLoc(), Inst, SymRef)) + const wasm::WasmSignature *Sig = nullptr; + if (getSignature(Operands[1]->getStartLoc(), Inst.getOperand(0), + wasm::WASM_SYMBOL_TYPE_TAG, Sig)) return true; - const auto *WasmSym = cast(&SymRef->getSymbol()); - const auto *Sig = WasmSym->getSignature(); - if (!Sig || WasmSym->getType() != wasm::WASM_SYMBOL_TYPE_TAG) - return typeError(Operands[1]->getStartLoc(), StringRef("symbol ") + - WasmSym->getName() + - ": missing .tagtype"); // catch instruction pushes values whose types are specified in the tag's // "params" part Stack.insert(Stack.end(), Sig->Params.begin(), Sig->Params.end()); @@ -383,15 +406,10 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, if (Name == "return_call_indirect" && endOfFunction(ErrorLoc)) return true; } else if (Name == "call" || Name == "return_call") { - const MCSymbolRefExpr *SymRef; - if (getSymRef(Operands[1]->getStartLoc(), Inst, SymRef)) - return true; - auto WasmSym = cast(&SymRef->getSymbol()); - auto Sig = WasmSym->getSignature(); - if (!Sig || WasmSym->getType() != wasm::WASM_SYMBOL_TYPE_FUNCTION) - return typeError(Operands[1]->getStartLoc(), StringRef("symbol ") + - WasmSym->getName() + - ": missing .functype"); + const wasm::WasmSignature *Sig = nullptr; + if (getSignature(Operands[1]->getStartLoc(), Inst.getOperand(0), + wasm::WASM_SYMBOL_TYPE_FUNCTION, Sig)) + return true; if (checkSig(ErrorLoc, *Sig)) return true; if (Name == "return_call" && endOfFunction(ErrorLoc)) diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h index 6fa95c3929753..9ba5693719e91 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h @@ -41,14 +41,17 @@ class WebAssemblyAsmTypeCheck final { bool typeError(SMLoc ErrorLoc, const Twine &Msg); bool popType(SMLoc ErrorLoc, std::optional EVT); bool popRefType(SMLoc ErrorLoc); - bool getLocal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); + bool getLocal(SMLoc ErrorLoc, const MCOperand &LocalOp, wasm::ValType &Type); bool checkEnd(SMLoc ErrorLoc, bool PopVals = false); bool checkBr(SMLoc ErrorLoc, size_t Level); bool checkSig(SMLoc ErrorLoc, const wasm::WasmSignature &Sig); - bool getSymRef(SMLoc ErrorLoc, const MCInst &Inst, + bool getSymRef(SMLoc ErrorLoc, const MCOperand &SymOp, const MCSymbolRefExpr *&SymRef); - bool getGlobal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); - bool getTable(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); + bool getGlobal(SMLoc ErrorLoc, const MCOperand &GlobalOp, + wasm::ValType &Type); + bool getTable(SMLoc ErrorLoc, const MCOperand &TableOp, wasm::ValType &Type); + bool getSignature(SMLoc ErrorLoc, const 
MCOperand &SigOp, + wasm::WasmSymbolType Type, const wasm::WasmSignature *&Sig); public: WebAssemblyAsmTypeCheck(MCAsmParser &Parser, const MCInstrInfo &MII, diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index b85ed1d93593b..903dbcd21ea96 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -367,3 +367,44 @@ void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, } } } + +void WebAssemblyInstPrinter::printCatchList(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned OpIdx = OpNo; + const MCOperand &Op = MI->getOperand(OpIdx++); + unsigned NumCatches = Op.getImm(); + + auto PrintTagOp = [&](const MCOperand &Op) { + const MCSymbolRefExpr *TagExpr = nullptr; + const MCSymbolWasm *TagSym = nullptr; + assert(Op.isExpr()); + TagExpr = dyn_cast(Op.getExpr()); + TagSym = cast(&TagExpr->getSymbol()); + O << TagSym->getName() << " "; + }; + + for (unsigned I = 0; I < NumCatches; I++) { + const MCOperand &Op = MI->getOperand(OpIdx++); + O << "("; + switch (Op.getImm()) { + case wasm::WASM_OPCODE_CATCH: + O << "catch "; + PrintTagOp(MI->getOperand(OpIdx++)); + break; + case wasm::WASM_OPCODE_CATCH_REF: + O << "catch_ref "; + PrintTagOp(MI->getOperand(OpIdx++)); + break; + case wasm::WASM_OPCODE_CATCH_ALL: + O << "catch_all "; + break; + case wasm::WASM_OPCODE_CATCH_ALL_REF: + O << "catch_all_ref "; + break; + } + O << MI->getOperand(OpIdx++).getImm(); // destination + O << ")"; + if (I < NumCatches - 1) + O << " "; + } +} diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h index 8fd54d1640905..b499926ab8296 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h @@ -47,6 +47,7 @@ class WebAssemblyInstPrinter final : public MCInstPrinter { raw_ostream &O); void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printCatchList(const MCInst *MI, unsigned OpNo, raw_ostream &O); // Autogenerated by tblgen. std::pair getMnemonic(const MCInst *MI) override; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 00f15e1db5e13..e3a60fa4812d8 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -87,6 +87,8 @@ enum OperandType { OPERAND_BRLIST, /// 32-bit unsigned table number. OPERAND_TABLE, + /// A list of catch clauses for try_table. + OPERAND_CATCH_LIST, }; } // end namespace WebAssembly @@ -119,6 +121,10 @@ enum TOF { // address relative the __table_base wasm global. // Only applicable to function symbols. MO_TABLE_BASE_REL, + + // On a block signature operand this indicates that this is a destination + // block of a (catch_ref) clause in try_table. 
+  MO_CATCH_BLOCK_SIG,
 };
 
 } // end namespace WebAssemblyII
 
@@ -462,6 +468,22 @@ inline bool isMarker(unsigned Opc) {
   case WebAssembly::TRY_S:
   case WebAssembly::END_TRY:
   case WebAssembly::END_TRY_S:
+  case WebAssembly::TRY_TABLE:
+  case WebAssembly::TRY_TABLE_S:
+  case WebAssembly::END_TRY_TABLE:
+  case WebAssembly::END_TRY_TABLE_S:
+    return true;
+  default:
+    return false;
+  }
+}
+
+inline bool isTry(unsigned Opc) {
+  switch (Opc) {
+  case WebAssembly::TRY:
+  case WebAssembly::TRY_S:
+  case WebAssembly::TRY_TABLE:
+  case WebAssembly::TRY_TABLE_S:
     return true;
   default:
     return false;
@@ -474,6 +496,14 @@ inline bool isCatch(unsigned Opc) {
   case WebAssembly::CATCH_LEGACY_S:
   case WebAssembly::CATCH_ALL_LEGACY:
   case WebAssembly::CATCH_ALL_LEGACY_S:
+  case WebAssembly::CATCH:
+  case WebAssembly::CATCH_S:
+  case WebAssembly::CATCH_REF:
+  case WebAssembly::CATCH_REF_S:
+  case WebAssembly::CATCH_ALL:
+  case WebAssembly::CATCH_ALL_S:
+  case WebAssembly::CATCH_ALL_REF:
+  case WebAssembly::CATCH_ALL_REF_S:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h
index 063ee4dba9068..4aca092e0e4c4 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h
@@ -33,11 +33,15 @@ enum class BlockType : unsigned {
   Externref = unsigned(wasm::ValType::EXTERNREF),
   Funcref = unsigned(wasm::ValType::FUNCREF),
   Exnref = unsigned(wasm::ValType::EXNREF),
-  // Multivalue blocks (and other non-void blocks) are only emitted when the
-  // blocks will never be exited and are at the ends of functions (see
-  // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made
-  // to pop values off the stack, so the exact multivalue signature can always
-  // be inferred from the return type of the parent function in MCInstLower.
+  // Multivalue blocks are emitted in two cases:
+  // 1. When the blocks will never be exited and are at the ends of functions
+  //    (see WebAssemblyCFGStackify::fixEndsAtEndOfFunction). In this case the
+  //    exact multivalue signature can always be inferred from the return type
+  //    of the parent function.
+  // 2. The (catch_ref ...) clause in a try_table instruction. Currently all
+  //    tags we support (cpp_exception and c_longjmp) throw a single i32, so
+  //    the multivalue signature for this case will be (i32, exnref).
+  // The real multivalue signature will be added in MCInstLower.
   Multivalue = 0xffff,
 };
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 6dd6145ed0057..14c0eaac17daa 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -683,6 +683,17 @@ void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // This is a compiler barrier that prevents instruction reordering during
     // backend compilation, and should not be emitted.
     break;
+  case WebAssembly::CATCH:
+  case WebAssembly::CATCH_S:
+  case WebAssembly::CATCH_REF:
+  case WebAssembly::CATCH_REF_S:
+  case WebAssembly::CATCH_ALL:
+  case WebAssembly::CATCH_ALL_S:
+  case WebAssembly::CATCH_ALL_REF:
+  case WebAssembly::CATCH_ALL_REF_S:
+    // These are pseudo instructions that represent catch clauses in a
+    // try_table instruction and simulate block return values.
+ break; default: { WebAssemblyMCInstLower MCInstLowering(OutContext, *this); MCInst TmpInst; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 3cccc57e629fd..a5f73fabca354 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -9,9 +9,9 @@ /// \file /// This file implements a CFG stacking pass. /// -/// This pass inserts BLOCK, LOOP, and TRY markers to mark the start of scopes, -/// since scope boundaries serve as the labels for WebAssembly's control -/// transfers. +/// This pass inserts BLOCK, LOOP, TRY, and TRY_TABLE markers to mark the start +/// of scopes, since scope boundaries serve as the labels for WebAssembly's +/// control transfers. /// /// This is sufficient to convert arbitrary CFGs into a form that works on /// WebAssembly, provided that all loops are single-entry. @@ -21,6 +21,7 @@ /// //===----------------------------------------------------------------------===// +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "Utils/WebAssemblyTypeUtilities.h" #include "WebAssembly.h" #include "WebAssemblyExceptionInfo.h" @@ -29,6 +30,7 @@ #include "WebAssemblySubtarget.h" #include "WebAssemblyUtilities.h" #include "llvm/ADT/Statistic.h" +#include "llvm/BinaryFormat/Wasm.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -74,6 +76,7 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { void placeBlockMarker(MachineBasicBlock &MBB); void placeLoopMarker(MachineBasicBlock &MBB); void placeTryMarker(MachineBasicBlock &MBB); + void placeTryTableMarker(MachineBasicBlock &MBB); // Exception handling related functions bool fixCallUnwindMismatches(MachineFunction &MF); @@ -97,11 +100,11 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { void fixEndsAtEndOfFunction(MachineFunction &MF); void cleanupFunctionData(MachineFunction &MF); - // For each BLOCK|LOOP|TRY, the corresponding END_(BLOCK|LOOP|TRY) or DELEGATE - // (in case of TRY). + // For each BLOCK|LOOP|TRY|TRY_TABLE, the corresponding + // END_(BLOCK|LOOP|TRY|TRY_TABLE) or DELEGATE (in case of TRY). DenseMap BeginToEnd; - // For each END_(BLOCK|LOOP|TRY) or DELEGATE, the corresponding - // BLOCK|LOOP|TRY. + // For each END_(BLOCK|LOOP|TRY|TRY_TABLE) or DELEGATE, the corresponding + // BLOCK|LOOP|TRY|TRY_TABLE. DenseMap EndToBegin; // map DenseMap TryToEHPad; @@ -150,9 +153,10 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { } // end anonymous namespace char WebAssemblyCFGStackify::ID = 0; -INITIALIZE_PASS(WebAssemblyCFGStackify, DEBUG_TYPE, - "Insert BLOCK/LOOP/TRY markers for WebAssembly scopes", false, - false) +INITIALIZE_PASS( + WebAssemblyCFGStackify, DEBUG_TYPE, + "Insert BLOCK/LOOP/TRY/TRY_TABLE markers for WebAssembly scopes", false, + false) FunctionPass *llvm::createWebAssemblyCFGStackify() { return new WebAssemblyCFGStackify(); @@ -314,12 +318,13 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { #endif } - // If there is a previously placed BLOCK/TRY marker and its corresponding - // END marker is before the current BLOCK's END marker, that should be - // placed after this BLOCK. Otherwise it should be placed before this BLOCK - // marker. 
+    // If there is a previously placed BLOCK/TRY/TRY_TABLE marker and its
+    // corresponding END marker is before the current BLOCK's END marker, that
+    // should be placed after this BLOCK. Otherwise it should be placed before
+    // this BLOCK marker.
     if (MI.getOpcode() == WebAssembly::BLOCK ||
-        MI.getOpcode() == WebAssembly::TRY) {
+        MI.getOpcode() == WebAssembly::TRY ||
+        MI.getOpcode() == WebAssembly::TRY_TABLE) {
       if (BeginToEnd[&MI]->getParent()->getNumber() <= MBB.getNumber())
         AfterSet.insert(&MI);
 #ifndef NDEBUG
@@ -329,10 +334,11 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
     }
 
 #ifndef NDEBUG
-    // All END_(BLOCK|LOOP|TRY) markers should be before the BLOCK.
+    // All END_(BLOCK|LOOP|TRY|TRY_TABLE) markers should be before the BLOCK.
     if (MI.getOpcode() == WebAssembly::END_BLOCK ||
         MI.getOpcode() == WebAssembly::END_LOOP ||
-        MI.getOpcode() == WebAssembly::END_TRY)
+        MI.getOpcode() == WebAssembly::END_TRY ||
+        MI.getOpcode() == WebAssembly::END_TRY_TABLE)
       BeforeSet.insert(&MI);
 #endif
 
@@ -374,6 +380,11 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
     // loop is above this block's header, the END_LOOP should be placed after
     // the END_BLOCK, because the loop contains this block. Otherwise the
     // END_LOOP should be placed before the END_BLOCK. The same for END_TRY.
+    //
+    // Note that while there can be existing END_TRYs, there can't be
+    // END_TRY_TABLEs; END_TRYs are placed when their corresponding EH pads
+    // are processed, so they are placed below MBB (the EH pad) in
+    // placeTryMarker. But an END_TRY_TABLE is placed like an END_BLOCK, so
+    // one cannot already be here.
     if (MI.getOpcode() == WebAssembly::END_LOOP ||
         MI.getOpcode() == WebAssembly::END_TRY) {
       if (EndToBegin[&MI]->getParent()->getNumber() >= Header->getNumber())
@@ -657,7 +668,251 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
   updateScopeTops(Header, End);
 }
 
+void WebAssemblyCFGStackify::placeTryTableMarker(MachineBasicBlock &MBB) {
+  assert(MBB.isEHPad());
+  MachineFunction &MF = *MBB.getParent();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  const auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
+  SortRegionInfo SRI(MLI, WEI);
+  const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+
+  // Compute the nearest common dominator of all unwind predecessors.
+  MachineBasicBlock *Header = nullptr;
+  int MBBNumber = MBB.getNumber();
+  for (auto *Pred : MBB.predecessors()) {
+    if (Pred->getNumber() < MBBNumber) {
+      Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred;
+      assert(!explicitlyBranchesTo(Pred, &MBB) &&
+             "Explicit branch to an EH pad!");
+    }
+  }
+  if (!Header)
+    return;
+
+  assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors");
+  MachineBasicBlock *LayoutPred = MBB.getPrevNode();
+
+  // If the nearest common dominator is inside a more deeply nested context,
+  // walk out to the nearest scope which isn't more deeply nested.
+  for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) {
+    if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) {
+      if (ScopeTop->getNumber() > Header->getNumber()) {
+        // Skip over an intervening scope.
+        I = std::next(ScopeTop->getIterator());
+      } else {
+        // We found a scope level at an appropriate depth.
+        Header = ScopeTop;
+        break;
+      }
+    }
+  }
+
+  // Decide where in Header to put the TRY_TABLE.
+
+  // Instructions that should go before the TRY_TABLE.
+  SmallPtrSet<const MachineInstr *, 4> BeforeSet;
+  // Instructions that should go after the TRY_TABLE.
+  SmallPtrSet<const MachineInstr *, 4> AfterSet;
+  for (const auto &MI : *Header) {
+    // If there is a previously placed LOOP marker and the bottom block of the
+    // loop is above MBB, it should be after the TRY_TABLE, because the loop is
+    // nested in this TRY_TABLE. Otherwise it should be before the TRY_TABLE.
+    if (MI.getOpcode() == WebAssembly::LOOP) {
+      auto *LoopBottom = BeginToEnd[&MI]->getParent()->getPrevNode();
+      if (MBB.getNumber() > LoopBottom->getNumber())
+        AfterSet.insert(&MI);
+#ifndef NDEBUG
+      else
+        BeforeSet.insert(&MI);
+#endif
+    }
+
+    // All previously inserted BLOCK/TRY_TABLE markers should be after the
+    // TRY_TABLE because they are all nested blocks/try_tables.
+    if (MI.getOpcode() == WebAssembly::BLOCK ||
+        MI.getOpcode() == WebAssembly::TRY_TABLE)
+      AfterSet.insert(&MI);
+
+#ifndef NDEBUG
+    // All END_(BLOCK/LOOP/TRY_TABLE) markers should be before the TRY_TABLE.
+    if (MI.getOpcode() == WebAssembly::END_BLOCK ||
+        MI.getOpcode() == WebAssembly::END_LOOP ||
+        MI.getOpcode() == WebAssembly::END_TRY_TABLE)
+      BeforeSet.insert(&MI);
+#endif
+
+    // Terminators should go after the TRY_TABLE.
+    if (MI.isTerminator())
+      AfterSet.insert(&MI);
+  }
+
+  // If Header unwinds to MBB (= Header contains 'invoke'), the try_table block
+  // should contain the call within it. So the call should go after the
+  // TRY_TABLE. The exception is when the header's terminator is a rethrow
+  // instruction, in which case that instruction, not a call instruction before
+  // it, is going to throw.
+  MachineInstr *ThrowingCall = nullptr;
+  if (MBB.isPredecessor(Header)) {
+    auto TermPos = Header->getFirstTerminator();
+    if (TermPos == Header->end() ||
+        TermPos->getOpcode() != WebAssembly::RETHROW) {
+      for (auto &MI : reverse(*Header)) {
+        if (MI.isCall()) {
+          AfterSet.insert(&MI);
+          ThrowingCall = &MI;
+          // Possibly throwing calls are usually wrapped by EH_LABEL
+          // instructions. We don't want to split them from the call.
+          if (MI.getIterator() != Header->begin() &&
+              std::prev(MI.getIterator())->isEHLabel()) {
+            AfterSet.insert(&*std::prev(MI.getIterator()));
+            ThrowingCall = &*std::prev(MI.getIterator());
+          }
+          break;
+        }
+      }
+    }
+  }
+
+  // The local expression tree should go after the TRY_TABLE.
+  // For BLOCK placement, we start the search from the previous instruction of
+  // a BB's terminator, but in TRY_TABLE's case, we should start from the
+  // previous instruction of a call that can throw, or an EH_LABEL that
+  // precedes the call, because the return values of the call's previous
+  // instructions can be stackified and consumed by the throwing call.
+  auto SearchStartPt = ThrowingCall ? MachineBasicBlock::iterator(ThrowingCall)
+                                    : Header->getFirstTerminator();
+  for (auto I = SearchStartPt, E = Header->begin(); I != E; --I) {
+    if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition())
+      continue;
+    if (WebAssembly::isChild(*std::prev(I), MFI))
+      AfterSet.insert(&*std::prev(I));
+    else
+      break;
+  }
+
+  // Add the TRY_TABLE and a BLOCK for the catch destination. We currently
+  // generate only one CATCH clause for a TRY_TABLE, so we need one BLOCK for
+  // its destination.
+  //
+  // Header:
+  //   block
+  //   try_table (catch ... $MBB)
+  //   ...
+  //
+  // MBB:
+  //   end_try_table
+  //   end_block          ;; destination of (catch ...)
+  //   ... catch handler body ...
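The placement code right below hands both sets to `getLatestInsertPos`, a pre-existing CFGStackify helper that this diff does not show. A sketch of its contract, assuming it keeps the shape of the file's other insert-position helpers (an iterator that is after every instruction in `BeforeSet` and before every instruction in `AfterSet`):

```cpp
#include "llvm/CodeGen/MachineBasicBlock.h"
#include <cassert>
using namespace llvm;

// Assumed sketch, not part of the patch: scan backwards and stop right after
// the last instruction that must stay before the marker being inserted.
template <typename Container>
static MachineBasicBlock::iterator
getLatestInsertPos(MachineBasicBlock *MBB, const Container &BeforeSet,
                   const Container &AfterSet) {
  auto InsertPos = MBB->end();
  while (InsertPos != MBB->begin()) {
    if (BeforeSet.count(&*std::prev(InsertPos))) {
#ifndef NDEBUG
      // Validation: nothing from AfterSet may appear before this point.
      for (auto Pos = InsertPos, E = MBB->begin(); Pos != E; --Pos)
        assert(!AfterSet.count(&*std::prev(Pos)));
#endif
      break;
    }
    --InsertPos;
  }
  return InsertPos;
}
```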
+  auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet);
+  MachineInstrBuilder BlockMIB =
+      BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
+              TII.get(WebAssembly::BLOCK));
+  auto *Block = BlockMIB.getInstr();
+  MachineInstrBuilder TryTableMIB =
+      BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
+              TII.get(WebAssembly::TRY_TABLE))
+          .addImm(int64_t(WebAssembly::BlockType::Void))
+          .addImm(1); // # of catch clauses
+  auto *TryTable = TryTableMIB.getInstr();
+
+  // Add a CATCH_*** clause to the TRY_TABLE. These are pseudo instructions
+  // following the destination END_BLOCK to simulate block return values,
+  // because we currently don't support them.
+  auto *Catch = WebAssembly::findCatch(&MBB);
+  switch (Catch->getOpcode()) {
+  case WebAssembly::CATCH:
+    // CATCH's destination block's return type is the extracted value type,
+    // which is currently i32 for all supported tags.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::I32));
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH);
+    for (const auto &Use : Catch->uses()) {
+      // The only use operand a CATCH can have is the tag symbol.
+      TryTableMIB.addExternalSymbol(Use.getSymbolName());
+      break;
+    }
+    TryTableMIB.addMBB(&MBB);
+    break;
+  case WebAssembly::CATCH_REF:
+    // CATCH_REF's destination block's return type is the extracted value type
+    // followed by an exnref, which is (i32, exnref) in our case. We assign the
+    // actual multivalue signature in MCInstLower. MO_CATCH_BLOCK_SIG signals
+    // that this operand is used for catch_ref's multivalue destination.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::Multivalue));
+    Block->getOperand(0).setTargetFlags(WebAssemblyII::MO_CATCH_BLOCK_SIG);
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH_REF);
+    for (const auto &Use : Catch->uses()) {
+      TryTableMIB.addExternalSymbol(Use.getSymbolName());
+      break;
+    }
+    TryTableMIB.addMBB(&MBB);
+    break;
+  case WebAssembly::CATCH_ALL:
+    // CATCH_ALL's destination block's return type is void.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::Void));
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH_ALL);
+    TryTableMIB.addMBB(&MBB);
+    break;
+  case WebAssembly::CATCH_ALL_REF:
+    // CATCH_ALL_REF's destination block's return type is exnref.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::Exnref));
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH_ALL_REF);
+    TryTableMIB.addMBB(&MBB);
+    break;
+  }
+
+  // Decide where in MBB to put the END_TRY_TABLE, and the END_BLOCK for the
+  // CATCH destination.
+  BeforeSet.clear();
+  AfterSet.clear();
+  for (const auto &MI : MBB) {
+#ifndef NDEBUG
+    // END_TRY_TABLE should precede existing LOOP markers.
+    if (MI.getOpcode() == WebAssembly::LOOP)
+      AfterSet.insert(&MI);
+#endif
+
+    // If there is a previously placed END_LOOP marker and the header of the
+    // loop is above this try_table's header, the END_LOOP should be placed
+    // after the END_TRY_TABLE, because the loop contains this block. Otherwise
+    // the END_LOOP should be placed before the END_TRY_TABLE.
+    if (MI.getOpcode() == WebAssembly::END_LOOP) {
+      if (EndToBegin[&MI]->getParent()->getNumber() >= Header->getNumber())
+        BeforeSet.insert(&MI);
+#ifndef NDEBUG
+      else
+        AfterSet.insert(&MI);
+#endif
+    }
+
+#ifndef NDEBUG
+    // CATCH, CATCH_REF, CATCH_ALL, and CATCH_ALL_REF are pseudo-instructions
+    // that simulate the block return value, so they should be placed after the
+    // END_TRY_TABLE.
+    if (WebAssembly::isCatch(MI.getOpcode()))
+      AfterSet.insert(&MI);
+#endif
+  }
+
+  // Mark the end of the TRY_TABLE and the BLOCK.
+ InsertPos = getEarliestInsertPos(&MBB, BeforeSet, AfterSet); + MachineInstr *EndTryTable = + BuildMI(MBB, InsertPos, MBB.findPrevDebugLoc(InsertPos), + TII.get(WebAssembly::END_TRY_TABLE)); + registerTryScope(TryTable, EndTryTable, &MBB); + MachineInstr *EndBlock = + BuildMI(MBB, InsertPos, MBB.findPrevDebugLoc(InsertPos), + TII.get(WebAssembly::END_BLOCK)); + registerScope(Block, EndBlock); + // Track the farthest-spanning scope that ends at this point. + updateScopeTops(Header, &MBB); +} + void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) { + if (WebAssembly::WasmEnableExnref) + return; + const auto &TII = *MF.getSubtarget().getInstrInfo(); // When there is an unconditional branch right before a catch instruction and @@ -1445,6 +1700,7 @@ void WebAssemblyCFGStackify::recalculateScopeTops(MachineFunction &MF) { case WebAssembly::END_BLOCK: case WebAssembly::END_LOOP: case WebAssembly::END_TRY: + case WebAssembly::END_TRY_TABLE: case WebAssembly::DELEGATE: updateScopeTops(EndToBegin[&MI]->getParent(), &MBB); break; @@ -1502,6 +1758,7 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) { } case WebAssembly::END_BLOCK: case WebAssembly::END_LOOP: + case WebAssembly::END_TRY_TABLE: case WebAssembly::DELEGATE: EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType)); continue; @@ -1528,7 +1785,7 @@ static void appendEndToFunction(MachineFunction &MF, TII.get(WebAssembly::END_FUNCTION)); } -/// Insert BLOCK/LOOP/TRY markers at appropriate places. +/// Insert BLOCK/LOOP/TRY/TRY_TABLE markers at appropriate places. void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { // We allocate one more than the number of blocks in the function to // accommodate for the possible fake block we may insert at the end. @@ -1540,15 +1797,25 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo(); for (auto &MBB : MF) { if (MBB.isEHPad()) { - // Place the TRY for MBB if MBB is the EH pad of an exception. + // Place the TRY/TRY_TABLE for MBB if MBB is the EH pad of an exception. if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm && - MF.getFunction().hasPersonalityFn()) - placeTryMarker(MBB); + MF.getFunction().hasPersonalityFn()) { + if (WebAssembly::WasmEnableExnref) + placeTryTableMarker(MBB); + else + placeTryMarker(MBB); + } } else { // Place the BLOCK for MBB if MBB is branched to from above. placeBlockMarker(MBB); } } + + // FIXME We return here temporarily until we implement fixing unwind + // mismatches for the new exnref proposal. + if (WebAssembly::WasmEnableExnref) + return; + // Fix mismatches in unwind destinations induced by linearizing the code. 
if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm && MF.getFunction().hasPersonalityFn()) { @@ -1668,11 +1935,14 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { for (auto &MBB : reverse(MF)) { for (MachineInstr &MI : llvm::reverse(MBB)) { switch (MI.getOpcode()) { + case WebAssembly::TRY_TABLE: + RewriteOperands(MI); + [[fallthrough]]; case WebAssembly::BLOCK: case WebAssembly::TRY: assert(ScopeTops[Stack.back().first->getNumber()]->getNumber() <= MBB.getNumber() && - "Block/try marker should be balanced"); + "Block/try/try_table marker should be balanced"); Stack.pop_back(); break; @@ -1687,6 +1957,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { [[fallthrough]]; } case WebAssembly::END_BLOCK: + case WebAssembly::END_TRY_TABLE: Stack.push_back(std::make_pair(&MBB, &MI)); break; @@ -1744,7 +2015,8 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { // Liveness is not tracked for VALUE_STACK physreg. MF.getRegInfo().invalidateLiveness(); - // Place the BLOCK/LOOP/TRY markers to indicate the beginnings of scopes. + // Place the BLOCK/LOOP/TRY/TRY_TABLE markers to indicate the beginnings of + // scopes. placeMarkers(MF); // Remove unnecessary instructions possibly introduced by try/end_trys. @@ -1755,8 +2027,8 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { // Convert MBB operands in terminators to relative depth immediates. rewriteDepthImmediates(MF); - // Fix up block/loop/try signatures at the end of the function to conform to - // WebAssembly's rules. + // Fix up block/loop/try/try_table signatures at the end of the function to + // conform to WebAssembly's rules. fixEndsAtEndOfFunction(MF); // Add an end instruction at the end of the function body. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index 60c5e18fbb0cd..b5b9cbeacfa18 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -211,8 +211,11 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { case Intrinsic::wasm_catch: { int Tag = Node->getConstantOperandVal(2); SDValue SymNode = getTagSymNode(Tag, CurDAG); + unsigned CatchOpcode = WebAssembly::WasmEnableExnref + ? WebAssembly::CATCH + : WebAssembly::CATCH_LEGACY; MachineSDNode *Catch = - CurDAG->getMachineNode(WebAssembly::CATCH_LEGACY, DL, + CurDAG->getMachineNode(CatchOpcode, DL, { PtrVT, // exception pointer MVT::Other // outchain type diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 05880b89d1fbc..97ff6d77f54b1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -125,15 +125,46 @@ defm DEBUG_UNREACHABLE : NRI<(outs), (ins), [(debugtrap)], "unreachable", 0x00>; // Exception handling instructions //===----------------------------------------------------------------------===// +// A list of catch clauses attached to try_table. 
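Before the operand definition that follows, a note on the encoding it denotes — the same layout `printCatchList` walks in the InstPrinter hunk above: a leading immediate carries the number of clauses, and each clause then contributes an opcode immediate, a tag-symbol expression for `catch`/`catch_ref` only, and a destination immediate. A hedged C++ reader over that layout; the helper name is illustrative, not from the patch:

```cpp
#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/MC/MCInst.h"
using namespace llvm;

// Illustrative only: skip over a catch-list operand sequence starting at
// OpIdx, returning the index of the first operand past the list.
static unsigned skipCatchList(const MCInst &MI, unsigned OpIdx) {
  unsigned NumCatches = MI.getOperand(OpIdx++).getImm();
  for (unsigned I = 0; I < NumCatches; ++I) {
    int64_t Opcode = MI.getOperand(OpIdx++).getImm();
    if (Opcode == wasm::WASM_OPCODE_CATCH ||
        Opcode == wasm::WASM_OPCODE_CATCH_REF)
      ++OpIdx; // tag symbol (an expression operand)
    ++OpIdx;   // destination block/depth immediate
  }
  return OpIdx;
}
```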
+def CatchListAsmOperand : AsmOperandClass { let Name = "CatchList"; }
+let OperandNamespace = "WebAssembly", OperandType = "OPERAND_CATCH_LIST" in
+def catch_list : Operand<i32> {
+  let ParserMatchClass = CatchListAsmOperand;
+  let PrintMethod = "printCatchList";
+}
+
 let Predicates = [HasExceptionHandling] in {
 
-// Throwing an exception: throw
+// Throwing an exception: throw / throw_ref
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
 defm THROW : I<(outs), (ins tag_op:$tag, variable_ops),
                (outs), (ins tag_op:$tag), [],
                "throw \t$tag", "throw \t$tag", 0x08>;
+defm THROW_REF : I<(outs), (ins EXNREF:$exn), (outs), (ins), [],
+                   "throw_ref \t$exn", "throw_ref", 0x0a>;
 } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
 
+// Region within which an exception is caught: try_table / end_try_table
+let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
+defm TRY_TABLE : I<(outs), (ins Signature:$sig, variable_ops),
+                   (outs), (ins Signature:$sig, catch_list:$cal), [],
+                   "try_table \t$sig", "try_table \t$sig $cal", 0x1f>;
+defm END_TRY_TABLE : NRI<(outs), (ins), [], "end_try_table", 0x0b>;
+} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
+
+// Pseudo instructions that represent catch / catch_ref / catch_all /
+// catch_all_ref clauses in a try_table instruction.
+let hasCtrlDep = 1, hasSideEffects = 1, isCodeGenOnly = 1 in {
+let variadicOpsAreDefs = 1 in {
+defm CATCH : I<(outs), (ins tag_op:$tag, variable_ops),
+               (outs), (ins tag_op:$tag), []>;
+defm CATCH_REF : I<(outs), (ins tag_op:$tag, variable_ops),
+                   (outs), (ins tag_op:$tag), []>;
+}
+defm CATCH_ALL : NRI<(outs), (ins), []>;
+defm CATCH_ALL_REF : I<(outs EXNREF:$dst), (ins), (outs), (ins), []>;
+}
+
 // Pseudo instructions: cleanupret / catchret
 let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
     isPseudo = 1, isEHScopeReturn = 1 in {
@@ -147,9 +178,10 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
 // usage gets low enough.
 
 // Rethrowing an exception: rethrow
+// The new exnref proposal also uses this instruction as an interim pseudo
+// instruction before we convert it to a THROW_REF.
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in
 defm RETHROW : NRI<(outs), (ins i32imm:$depth), [], "rethrow \t$depth", 0x09>;
-// isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
 
 // The depth argument will be computed in CFGStackify. We set it to 0 here for
 // now.
def : Pat<(int_wasm_rethrow), (RETHROW 0)>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index f0c205cdb6aeb..70b406b6552bf 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -37,6 +37,7 @@ class WebAssemblyLateEHPrepare final : public MachineFunctionPass {
   void recordCatchRetBBs(MachineFunction &MF);
   bool hoistCatches(MachineFunction &MF);
   bool addCatchAlls(MachineFunction &MF);
+  bool addCatchRefsAndThrowRefs(MachineFunction &MF);
   bool replaceFuncletReturns(MachineFunction &MF);
   bool removeUnnecessaryUnreachables(MachineFunction &MF);
   bool restoreStackPointer(MachineFunction &MF);
@@ -127,6 +128,8 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
     Changed |= hoistCatches(MF);
     Changed |= addCatchAlls(MF);
     Changed |= replaceFuncletReturns(MF);
+    if (WebAssembly::WasmEnableExnref)
+      Changed |= addCatchRefsAndThrowRefs(MF);
   }
   Changed |= removeUnnecessaryUnreachables(MF);
   if (MF.getFunction().hasPersonalityFn())
@@ -214,9 +217,12 @@ bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) {
     if (InsertPos == MBB.end() ||
         !WebAssembly::isCatch(InsertPos->getOpcode())) {
       Changed = true;
+      unsigned CatchAllOpcode = WebAssembly::WasmEnableExnref
+                                    ? WebAssembly::CATCH_ALL
+                                    : WebAssembly::CATCH_ALL_LEGACY;
       BuildMI(MBB, InsertPos,
               InsertPos == MBB.end() ? DebugLoc() : InsertPos->getDebugLoc(),
-              TII.get(WebAssembly::CATCH_ALL_LEGACY));
+              TII.get(CatchAllOpcode));
     }
   }
   return Changed;
@@ -248,6 +254,10 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
     case WebAssembly::CLEANUPRET: {
       // Replace a cleanupret with a rethrow. For C++ support, currently
       // rethrow's immediate argument is always 0 (= the latest exception).
+      //
+      // Even when -wasm-enable-exnref is true, we use a RETHROW here for the
+      // moment. This will be converted to a THROW_REF in
+      // addCatchRefsAndThrowRefs.
       BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW))
           .addImm(0);
       TI->eraseFromParent();
@@ -259,14 +269,74 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
   return Changed;
 }

-// Remove unnecessary unreachables after a throw or rethrow.
+// Add CATCH_REF and CATCH_ALL_REF pseudo instructions to EH pads, and convert
+// RETHROWs to THROW_REFs.
+bool WebAssemblyLateEHPrepare::addCatchRefsAndThrowRefs(MachineFunction &MF) {
+  bool Changed = false;
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  auto &MRI = MF.getRegInfo();
+  DenseMap<MachineBasicBlock *, SmallVector<MachineInstr *, 2>>
+      EHPadToRethrows;
+
+  // Create a map of <EH pad, a vector of RETHROWs rethrowing its exception>
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (MI.getOpcode() == WebAssembly::RETHROW) {
+        Changed = true;
+        auto *EHPad = getMatchingEHPad(&MI);
+        EHPadToRethrows[EHPad].push_back(&MI);
+      }
+    }
+  }
+
+  // Convert CATCH into CATCH_REF and CATCH_ALL into CATCH_ALL_REF when the
+  // caught exception is rethrown, and convert RETHROWs to THROW_REFs.
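+  //
+  // Schematically, the rewrite below looks like this (pseudo-MIR; the
+  // register and tag names are illustrative, not actual syntax):
+  //   before:  %vals... = CATCH @tag          ...   RETHROW 0
+  //   after:   %vals..., %exn = CATCH_REF @tag ...  THROW_REF %exn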
+  for (auto &[EHPad, Rethrows] : EHPadToRethrows) {
+    auto *Catch = WebAssembly::findCatch(EHPad);
+    auto *InsertPos = Catch->getIterator()->getNextNode();
+    auto ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
+    if (Catch->getOpcode() == WebAssembly::CATCH) {
+      MachineInstrBuilder MIB = BuildMI(*EHPad, InsertPos, Catch->getDebugLoc(),
+                                        TII.get(WebAssembly::CATCH_REF));
+      // Copy defs (= extracted values) from the old CATCH to the new CATCH_REF
+      for (const auto &Def : Catch->defs())
+        MIB.addDef(Def.getReg());
+      MIB.addDef(ExnReg); // Attach the exnref def after extracted values
+      // Copy the tag symbol (the only use operand a CATCH can have is the tag
+      // symbol)
+      for (const auto &Use : Catch->uses()) {
+        MIB.addExternalSymbol(Use.getSymbolName());
+        break;
+      }
+    } else if (Catch->getOpcode() == WebAssembly::CATCH_ALL) {
+      BuildMI(*EHPad, InsertPos, Catch->getDebugLoc(),
+              TII.get(WebAssembly::CATCH_ALL_REF))
+          .addDef(ExnReg);
+    } else {
+      assert(false);
+    }
+    Catch->eraseFromParent();
+
+    for (auto *Rethrow : Rethrows) {
+      auto InsertPos = std::next(Rethrow->getIterator());
+      BuildMI(*Rethrow->getParent(), InsertPos, Rethrow->getDebugLoc(),
+              TII.get(WebAssembly::THROW_REF))
+          .addReg(ExnReg);
+      Rethrow->eraseFromParent();
+    }
+  }
+
+  return Changed;
+}
+
+// Remove unnecessary unreachables after a throw/rethrow/throw_ref.
 bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables(
     MachineFunction &MF) {
   bool Changed = false;
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
       if (MI.getOpcode() != WebAssembly::THROW &&
-          MI.getOpcode() != WebAssembly::RETHROW)
+          MI.getOpcode() != WebAssembly::RETHROW &&
+          MI.getOpcode() != WebAssembly::THROW_REF)
         continue;
       Changed = true;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 431dc7f33ac89..73ff50f39b020 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//

 #include "WebAssemblyMCInstLower.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "TargetInfo/WebAssemblyTargetInfo.h"
 #include "Utils/WebAssemblyTypeUtilities.h"
 #include "WebAssemblyAsmPrinter.h"
@@ -220,12 +221,27 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
         MCOp = lowerTypeIndexOperand(std::move(Returns), std::move(Params));
         break;
-      } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
+      }
+      if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
         auto BT = static_cast<WebAssembly::BlockType>(MO.getImm());
         assert(BT != WebAssembly::BlockType::Invalid);
         if (BT == WebAssembly::BlockType::Multivalue) {
-          SmallVector<wasm::ValType, 1> Returns;
-          getFunctionReturns(MI, Returns);
+          SmallVector<wasm::ValType, 2> Returns;
+          // Multivalue blocks are emitted in two cases:
+          // 1. When the blocks will never be exited and are at the ends of
+          //    functions (see
+          //    WebAssemblyCFGStackify::fixEndsAtEndOfFunction). In this case
+          //    the exact multivalue signature can always be inferred from the
+          //    return type of the parent function.
+          // 2. (catch_ref ...) clause in try_table instruction. Currently all
+          //    tags we support (cpp_exception and c_longjmp) throw a single
+          //    i32, so the multivalue signature for this case will be (i32,
+          //    exnref). Having MO_CATCH_BLOCK_SIG target flags means this is
+          //    a destination of a catch_ref.
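+          // For instance, a (catch_ref ...) destination for cpp_exception
+          // ends up as a block whose type is (i32, exnref). This is an
+          // illustrative rendering; the exact printed form comes from the
+          // type index lowering below.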
+          if (MO.getTargetFlags() == WebAssemblyII::MO_CATCH_BLOCK_SIG)
+            Returns = {wasm::ValType::I32, wasm::ValType::EXNREF};
+          else
+            getFunctionReturns(MI, Returns);
           MCOp = lowerTypeIndexOperand(std::move(Returns),
                                        SmallVector<wasm::ValType>());
           break;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index c5a047ee47d73..ed186e65a80cf 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -43,6 +43,8 @@ bool WebAssembly::mayThrow(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   case WebAssembly::THROW:
   case WebAssembly::THROW_S:
+  case WebAssembly::THROW_REF:
+  case WebAssembly::THROW_REF_S:
   case WebAssembly::RETHROW:
   case WebAssembly::RETHROW_S:
     return true;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a1d466eee691c..d0794cb9bfde3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56096,34 +56096,50 @@ static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
   if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
     return SDValue();

-  X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
-  if (CC != X86::COND_S && CC != X86::COND_NS)
-    return SDValue();
-
-  // Condition should come from a negate operation.
   SDValue Cond = N1.getOperand(3);
-  if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
+  if (Cond.getOpcode() != X86ISD::SUB)
     return SDValue();
   assert(Cond.getResNo() == 1 && "Unexpected result number");

-  // Get the X and -X from the negate.
-  SDValue NegX = Cond.getValue(0);
-  SDValue X = Cond.getOperand(1);
-
   SDValue FalseOp = N1.getOperand(0);
   SDValue TrueOp = N1.getOperand(1);
+  X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
+  MVT VT = N->getSimpleValueType(0);
+  SDLoc DL(N);

-  // Cmov operands should be X and NegX. Order doesn't matter.
-  if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
-    return SDValue();
+  // ABS condition should come from a negate operation.
+  if ((CC == X86::COND_S || CC == X86::COND_NS) &&
+      isNullConstant(Cond.getOperand(0))) {
+    // Get the X and -X from the negate.
+    SDValue NegX = Cond.getValue(0);
+    SDValue X = Cond.getOperand(1);

-  // Build a new CMOV with the operands swapped.
-  SDLoc DL(N);
-  MVT VT = N->getSimpleValueType(0);
-  SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
-                             N1.getOperand(2), Cond);
-  // Convert sub to add.
-  return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
+    // Cmov operands should be X and NegX. Order doesn't matter.
+    if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
+      return SDValue();
+
+    // Build a new CMOV with the operands swapped.
+    SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
+                               N1.getOperand(2), Cond);
+    // Convert sub to add.
+    return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
+  }
+
+  // Handle ABD special case:
+  // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
+  // ABD condition should come from a pair of matching subtracts.
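+  // A concrete instance of the rewrite above, with X = 1, Y = 3:
+  // ABD(1,3) = CMOV(SUB(1,3), SUB(3,1)) selects 2, so NEG(ABD(1,3)) = -2.
+  // Swapping the CMOV arms under the same condition selects SUB(1,3) = -2
+  // directly, eliminating the outer NEG (the subtract from zero).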
+ if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) && + (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) && + (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) && + (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) && + (TrueOp.getOperand(0) == FalseOp.getOperand(1)) && + (TrueOp.getOperand(1) == FalseOp.getOperand(0))) { + // Build a new CMOV with the operands swapped. + return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2), + Cond); + } + + return SDValue(); } static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) { diff --git a/llvm/lib/Transforms/Coroutines/CMakeLists.txt b/llvm/lib/Transforms/Coroutines/CMakeLists.txt index c6508174a7f10..46ef5cd4e8cfe 100644 --- a/llvm/lib/Transforms/Coroutines/CMakeLists.txt +++ b/llvm/lib/Transforms/Coroutines/CMakeLists.txt @@ -9,6 +9,7 @@ add_llvm_component_library(LLVMCoroutines CoroSplit.cpp SuspendCrossingInfo.cpp SpillUtils.cpp + MaterializationUtils.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Coroutines diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 4b76fc7936100..b74c9f01cd239 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -16,10 +16,10 @@ //===----------------------------------------------------------------------===// #include "CoroInternal.h" +#include "MaterializationUtils.h" #include "SpillUtils.h" #include "SuspendCrossingInfo.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallString.h" #include "llvm/Analysis/StackLifetime.h" @@ -36,135 +36,12 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include -#include #include using namespace llvm; extern cl::opt UseNewDbgInfoFormat; -// The "coro-suspend-crossing" flag is very noisy. There is another debug type, -// "coro-frame", which results in leaner debug spew. -#define DEBUG_TYPE "coro-suspend-crossing" - -namespace { - -// RematGraph is used to construct a DAG for rematerializable instructions -// When the constructor is invoked with a candidate instruction (which is -// materializable) it builds a DAG of materializable instructions from that -// point. -// Typically, for each instruction identified as re-materializable across a -// suspend point, a RematGraph will be created. -struct RematGraph { - // Each RematNode in the graph contains the edges to instructions providing - // operands in the current node. 
- struct RematNode { - Instruction *Node; - SmallVector Operands; - RematNode() = default; - RematNode(Instruction *V) : Node(V) {} - }; - - RematNode *EntryNode; - using RematNodeMap = - SmallMapVector, 8>; - RematNodeMap Remats; - const std::function &MaterializableCallback; - SuspendCrossingInfo &Checker; - - RematGraph(const std::function &MaterializableCallback, - Instruction *I, SuspendCrossingInfo &Checker) - : MaterializableCallback(MaterializableCallback), Checker(Checker) { - std::unique_ptr FirstNode = std::make_unique(I); - EntryNode = FirstNode.get(); - std::deque> WorkList; - addNode(std::move(FirstNode), WorkList, cast(I)); - while (WorkList.size()) { - std::unique_ptr N = std::move(WorkList.front()); - WorkList.pop_front(); - addNode(std::move(N), WorkList, cast(I)); - } - } - - void addNode(std::unique_ptr NUPtr, - std::deque> &WorkList, - User *FirstUse) { - RematNode *N = NUPtr.get(); - if (Remats.count(N->Node)) - return; - - // We haven't see this node yet - add to the list - Remats[N->Node] = std::move(NUPtr); - for (auto &Def : N->Node->operands()) { - Instruction *D = dyn_cast(Def.get()); - if (!D || !MaterializableCallback(*D) || - !Checker.isDefinitionAcrossSuspend(*D, FirstUse)) - continue; - - if (Remats.count(D)) { - // Already have this in the graph - N->Operands.push_back(Remats[D].get()); - continue; - } - - bool NoMatch = true; - for (auto &I : WorkList) { - if (I->Node == D) { - NoMatch = false; - N->Operands.push_back(I.get()); - break; - } - } - if (NoMatch) { - // Create a new node - std::unique_ptr ChildNode = std::make_unique(D); - N->Operands.push_back(ChildNode.get()); - WorkList.push_back(std::move(ChildNode)); - } - } - } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - static std::string getBasicBlockLabel(const BasicBlock *BB) { - if (BB->hasName()) - return BB->getName().str(); - - std::string S; - raw_string_ostream OS(S); - BB->printAsOperand(OS, false); - return OS.str().substr(1); - } - - void dump() const { - dbgs() << "Entry ("; - dbgs() << getBasicBlockLabel(EntryNode->Node->getParent()); - dbgs() << ") : " << *EntryNode->Node << "\n"; - for (auto &E : Remats) { - dbgs() << *(E.first) << "\n"; - for (RematNode *U : E.second->Operands) - dbgs() << " " << *U->Node << "\n"; - } - } -#endif -}; -} // end anonymous namespace - -namespace llvm { - -template <> struct GraphTraits { - using NodeRef = RematGraph::RematNode *; - using ChildIteratorType = RematGraph::RematNode **; - - static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; } - static ChildIteratorType child_begin(NodeRef N) { - return N->Operands.begin(); - } - static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); } -}; - -} // end namespace llvm - -#undef DEBUG_TYPE // "coro-suspend-crossing" #define DEBUG_TYPE "coro-frame" namespace { @@ -268,15 +145,6 @@ static void dumpSpills(StringRef Title, const coro::SpillInfo &Spills) { I->dump(); } } -static void dumpRemats( - StringRef Title, - const SmallMapVector, 8> &RM) { - dbgs() << "------------- " << Title << "--------------\n"; - for (const auto &E : RM) { - E.second->dump(); - dbgs() << "--\n"; - } -} static void dumpAllocas(const SmallVectorImpl &Allocas) { dbgs() << "------------- Allocas --------------\n"; @@ -1634,93 +1502,6 @@ static void rewritePHIs(Function &F) { rewritePHIs(*BB); } -/// Default materializable callback -// Check for instructions that we can recreate on resume as opposed to spill -// the result into a coroutine frame. 
-bool coro::defaultMaterializable(Instruction &V) { - return (isa(&V) || isa(&V) || - isa(&V) || isa(&V) || isa(&V)); -} - -// For each instruction identified as materializable across the suspend point, -// and its associated DAG of other rematerializable instructions, -// recreate the DAG of instructions after the suspend point. -static void rewriteMaterializableInstructions( - const SmallMapVector, 8> - &AllRemats) { - // This has to be done in 2 phases - // Do the remats and record the required defs to be replaced in the - // original use instructions - // Once all the remats are complete, replace the uses in the final - // instructions with the new defs - typedef struct { - Instruction *Use; - Instruction *Def; - Instruction *Remat; - } ProcessNode; - - SmallVector FinalInstructionsToProcess; - - for (const auto &E : AllRemats) { - Instruction *Use = E.first; - Instruction *CurrentMaterialization = nullptr; - RematGraph *RG = E.second.get(); - ReversePostOrderTraversal RPOT(RG); - SmallVector InstructionsToProcess; - - // If the target use is actually a suspend instruction then we have to - // insert the remats into the end of the predecessor (there should only be - // one). This is so that suspend blocks always have the suspend instruction - // as the first instruction. - auto InsertPoint = &*Use->getParent()->getFirstInsertionPt(); - if (isa(Use)) { - BasicBlock *SuspendPredecessorBlock = - Use->getParent()->getSinglePredecessor(); - assert(SuspendPredecessorBlock && "malformed coro suspend instruction"); - InsertPoint = SuspendPredecessorBlock->getTerminator(); - } - - // Note: skip the first instruction as this is the actual use that we're - // rematerializing everything for. - auto I = RPOT.begin(); - ++I; - for (; I != RPOT.end(); ++I) { - Instruction *D = (*I)->Node; - CurrentMaterialization = D->clone(); - CurrentMaterialization->setName(D->getName()); - CurrentMaterialization->insertBefore(InsertPoint); - InsertPoint = CurrentMaterialization; - - // Replace all uses of Def in the instructions being added as part of this - // rematerialization group - for (auto &I : InstructionsToProcess) - I->replaceUsesOfWith(D, CurrentMaterialization); - - // Don't replace the final use at this point as this can cause problems - // for other materializations. Instead, for any final use that uses a - // define that's being rematerialized, record the replace values - for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i) - if (Use->getOperand(i) == D) // Is this operand pointing to oldval? - FinalInstructionsToProcess.push_back( - {Use, D, CurrentMaterialization}); - - InstructionsToProcess.push_back(CurrentMaterialization); - } - } - - // Finally, replace the uses with the defines that we've just rematerialized - for (auto &R : FinalInstructionsToProcess) { - if (auto *PN = dyn_cast(R.Use)) { - assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming " - "values in the PHINode"); - PN->replaceAllUsesWith(R.Remat); - PN->eraseFromParent(); - continue; - } - R.Use->replaceUsesOfWith(R.Def, R.Remat); - } -} - // Splits the block at a particular instruction unless it is the first // instruction in the block with a single predecessor. 
static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) { @@ -1741,10 +1522,6 @@ static void splitAround(Instruction *I, const Twine &Name) { splitBlockIfNotFirst(I->getNextNode(), "After" + Name); } -static bool isSuspendBlock(BasicBlock *BB) { - return isa(BB->front()); -} - /// After we split the coroutine, will the given basic block be along /// an obvious exit path for the resumption function? static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB, @@ -1754,7 +1531,8 @@ static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB, if (depth == 0) return false; // If this is a suspend block, we're about to exit the resumption function. - if (isSuspendBlock(BB)) return true; + if (coro::isSuspendBlock(BB)) + return true; // Recurse into the successors. for (auto *Succ : successors(BB)) { @@ -1994,7 +1772,8 @@ static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape, DomSet.insert(&F.getEntryBlock()); for (auto *CSI : Shape.CoroSuspends) { BasicBlock *SuspendBlock = CSI->getParent(); - assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() && + assert(coro::isSuspendBlock(SuspendBlock) && + SuspendBlock->getSingleSuccessor() && "should have split coro.suspend into its own block"); DomSet.insert(SuspendBlock->getSingleSuccessor()); } @@ -2226,71 +2005,8 @@ void coro::salvageDebugInfo( } } -static void doRematerializations( - Function &F, SuspendCrossingInfo &Checker, - const std::function &MaterializableCallback) { - if (F.hasOptNone()) - return; - - coro::SpillInfo Spills; - - // See if there are materializable instructions across suspend points - // We record these as the starting point to also identify materializable - // defs of uses in these operations - for (Instruction &I : instructions(F)) { - if (!MaterializableCallback(I)) - continue; - for (User *U : I.users()) - if (Checker.isDefinitionAcrossSuspend(I, U)) - Spills[&I].push_back(cast(U)); - } - - // Process each of the identified rematerializable instructions - // and add predecessor instructions that can also be rematerialized. - // This is actually a graph of instructions since we could potentially - // have multiple uses of a def in the set of predecessor instructions. - // The approach here is to maintain a graph of instructions for each bottom - // level instruction - where we have a unique set of instructions (nodes) - // and edges between them. We then walk the graph in reverse post-dominator - // order to insert them past the suspend point, but ensure that ordering is - // correct. We also rely on CSE removing duplicate defs for remats of - // different instructions with a def in common (rather than maintaining more - // complex graphs for each suspend point) - - // We can do this by adding new nodes to the list for each suspend - // point. 
Then using standard GraphTraits to give a reverse post-order - // traversal when we insert the nodes after the suspend - SmallMapVector, 8> AllRemats; - for (auto &E : Spills) { - for (Instruction *U : E.second) { - // Don't process a user twice (this can happen if the instruction uses - // more than one rematerializable def) - if (AllRemats.count(U)) - continue; - - // Constructor creates the whole RematGraph for the given Use - auto RematUPtr = - std::make_unique(MaterializableCallback, U, Checker); - - LLVM_DEBUG(dbgs() << "***** Next remat group *****\n"; - ReversePostOrderTraversal RPOT(RematUPtr.get()); - for (auto I = RPOT.begin(); I != RPOT.end(); - ++I) { (*I)->Node->dump(); } dbgs() - << "\n";); - - AllRemats[U] = std::move(RematUPtr); - } - } - - // Rewrite materializable instructions to be materialized at the use - // point. - LLVM_DEBUG(dumpRemats("Materializations", AllRemats)); - rewriteMaterializableInstructions(AllRemats); -} - -void coro::buildCoroutineFrame( - Function &F, Shape &Shape, TargetTransformInfo &TTI, - const std::function &MaterializableCallback) { +void coro::normalizeCoroutine(Function &F, coro::Shape &Shape, + TargetTransformInfo &TTI) { // Don't eliminate swifterror in async functions that won't be split. if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty()) eliminateSwiftError(F, Shape); @@ -2324,8 +2040,8 @@ void coro::buildCoroutineFrame( IRBuilder<> Builder(AsyncEnd); SmallVector Args(AsyncEnd->args()); auto Arguments = ArrayRef(Args).drop_front(3); - auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn, - TTI, Arguments, Builder); + auto *Call = coro::createMustTailCall( + AsyncEnd->getDebugLoc(), MustTailCallFn, TTI, Arguments, Builder); splitAround(Call, "MustTailCall.Before.CoroEnd"); } } @@ -2337,10 +2053,12 @@ void coro::buildCoroutineFrame( // Transforms multi-edge PHI Nodes, so that any value feeding into a PHI will // never have its definition separated from the PHI by the suspend point. rewritePHIs(F); +} - // Build suspend crossing info. 
+void coro::buildCoroutineFrame(
+    Function &F, Shape &Shape,
+    const std::function<bool(Instruction &)> &MaterializableCallback) {
   SuspendCrossingInfo Checker(F, Shape.CoroSuspends, Shape.CoroEnds);
-  doRematerializations(F, Checker, MaterializableCallback);

   const DominatorTree DT(F);
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index be86f96525b67..891798f53b2d0 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -21,6 +21,7 @@ class CallGraph;

 namespace coro {

+bool isSuspendBlock(BasicBlock *BB);
 bool declaresAnyIntrinsic(const Module &M);
 bool declaresIntrinsics(const Module &M,
                         const std::initializer_list<StringRef>);
@@ -281,8 +282,10 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
 };

 bool defaultMaterializable(Instruction &V);
+void normalizeCoroutine(Function &F, coro::Shape &Shape,
+                        TargetTransformInfo &TTI);
 void buildCoroutineFrame(
-    Function &F, Shape &Shape, TargetTransformInfo &TTI,
+    Function &F, Shape &Shape,
     const std::function<bool(Instruction &)> &MaterializableCallback);
 CallInst *createMustTailCall(DebugLoc Loc, Function *MustTailCallFn,
                              TargetTransformInfo &TTI,
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 494c4d632de95..dc3829d7f28eb 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -2030,7 +2030,8 @@ splitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones,
   lowerAwaitSuspends(F, Shape);

   simplifySuspendPoints(Shape);
-  buildCoroutineFrame(F, Shape, TTI, MaterializableCallback);
+  normalizeCoroutine(F, Shape, TTI);
+  buildCoroutineFrame(F, Shape, MaterializableCallback);
   replaceFrameSizeAndAlignment(Shape);

   bool isNoSuspendCoroutine = Shape.CoroSuspends.empty();
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index be257339e0ac4..cdc442bc819c3 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -100,6 +100,10 @@ static bool isCoroutineIntrinsicName(StringRef Name) {
 }
 #endif

+bool coro::isSuspendBlock(BasicBlock *BB) {
+  return isa<AnyCoroSuspendInst>(BB->front());
+}
+
 bool coro::declaresAnyIntrinsic(const Module &M) {
   for (StringRef Name : CoroIntrinsics) {
     assert(isCoroutineIntrinsicName(Name) && "not a coroutine intrinsic");
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
new file mode 100644
index 0000000000000..708e8734175f9
--- /dev/null
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
@@ -0,0 +1,308 @@
+//===- MaterializationUtils.cpp - Materialize insts after suspend points -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file contains classes used to materialize insts after suspend points.
+//===----------------------------------------------------------------------===//
+
+#include "MaterializationUtils.h"
+#include "SpillUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include <deque>
+
+using namespace llvm;
+
+using namespace coro;
+
+// The "coro-suspend-crossing" flag is very noisy. There is another debug type,
+// "coro-frame", which results in leaner debug spew.
+#define DEBUG_TYPE "coro-suspend-crossing"
+
+namespace {
+
+// RematGraph is used to construct a DAG for rematerializable instructions.
+// When the constructor is invoked with a candidate instruction (which is
+// materializable) it builds a DAG of materializable instructions from that
+// point.
+// Typically, for each instruction identified as re-materializable across a
+// suspend point, a RematGraph will be created.
+struct RematGraph {
+  // Each RematNode in the graph contains the edges to instructions providing
+  // operands in the current node.
+  struct RematNode {
+    Instruction *Node;
+    SmallVector<RematNode *, 4> Operands;
+    RematNode() = default;
+    RematNode(Instruction *V) : Node(V) {}
+  };
+
+  RematNode *EntryNode;
+  using RematNodeMap =
+      SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
+  RematNodeMap Remats;
+  const std::function<bool(Instruction &)> &MaterializableCallback;
+  SuspendCrossingInfo &Checker;
+
+  RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
+             Instruction *I, SuspendCrossingInfo &Checker)
+      : MaterializableCallback(MaterializableCallback), Checker(Checker) {
+    std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
+    EntryNode = FirstNode.get();
+    std::deque<std::unique_ptr<RematNode>> WorkList;
+    addNode(std::move(FirstNode), WorkList, cast<User>(I));
+    while (WorkList.size()) {
+      std::unique_ptr<RematNode> N = std::move(WorkList.front());
+      WorkList.pop_front();
+      addNode(std::move(N), WorkList, cast<User>(I));
+    }
+  }
+
+  void addNode(std::unique_ptr<RematNode> NUPtr,
+               std::deque<std::unique_ptr<RematNode>> &WorkList,
+               User *FirstUse) {
+    RematNode *N = NUPtr.get();
+    if (Remats.count(N->Node))
+      return;
+
+    // We haven't seen this node yet - add to the list
+    Remats[N->Node] = std::move(NUPtr);
+    for (auto &Def : N->Node->operands()) {
+      Instruction *D = dyn_cast<Instruction>(Def.get());
+      if (!D || !MaterializableCallback(*D) ||
+          !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
+        continue;
+
+      if (Remats.count(D)) {
+        // Already have this in the graph
+        N->Operands.push_back(Remats[D].get());
+        continue;
+      }
+
+      bool NoMatch = true;
+      for (auto &I : WorkList) {
+        if (I->Node == D) {
+          NoMatch = false;
+          N->Operands.push_back(I.get());
+          break;
+        }
+      }
+      if (NoMatch) {
+        // Create a new node
+        std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
+        N->Operands.push_back(ChildNode.get());
+        WorkList.push_back(std::move(ChildNode));
+      }
+    }
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  static std::string getBasicBlockLabel(const BasicBlock *BB) {
+    if (BB->hasName())
+      return BB->getName().str();
+
+    std::string S;
+    raw_string_ostream OS(S);
+    BB->printAsOperand(OS, false);
+    return OS.str().substr(1);
+  }
+
+  void dump() const {
+    dbgs() << "Entry (";
+    dbgs() << getBasicBlockLabel(EntryNode->Node->getParent());
+    dbgs() << ") : " << *EntryNode->Node << "\n";
+    for (auto &E : Remats) {
+      dbgs() << *(E.first) << "\n";
+      for (RematNode *U : E.second->Operands)
+        dbgs() << "  " << *U->Node << "\n";
+    }
+  }
+#endif
+};
+
+} // namespace
+
+namespace llvm {
+template <> struct GraphTraits<RematGraph *> {
+  using NodeRef = RematGraph::RematNode *;
+  using ChildIteratorType = RematGraph::RematNode **;
+
+  static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
+  static ChildIteratorType child_begin(NodeRef N) {
+    return N->Operands.begin();
+  }
+  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
+};
+
+} // end namespace llvm
+
+// For each instruction identified as materializable across the suspend point,
+// and its associated DAG of other rematerializable instructions,
+// recreate the DAG of instructions after the suspend point.
+static void rewriteMaterializableInstructions(
+    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
+        &AllRemats) {
+  // This has to be done in 2 phases
+  // Do the remats and record the required defs to be replaced in the
+  // original use instructions
+  // Once all the remats are complete, replace the uses in the final
+  // instructions with the new defs
+  typedef struct {
+    Instruction *Use;
+    Instruction *Def;
+    Instruction *Remat;
+  } ProcessNode;
+
+  SmallVector<ProcessNode> FinalInstructionsToProcess;
+
+  for (const auto &E : AllRemats) {
+    Instruction *Use = E.first;
+    Instruction *CurrentMaterialization = nullptr;
+    RematGraph *RG = E.second.get();
+    ReversePostOrderTraversal<RematGraph *> RPOT(RG);
+    SmallVector<Instruction *> InstructionsToProcess;
+
+    // If the target use is actually a suspend instruction then we have to
+    // insert the remats into the end of the predecessor (there should only be
+    // one). This is so that suspend blocks always have the suspend instruction
+    // as the first instruction.
+    auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
+    if (isa<AnyCoroSuspendInst>(Use)) {
+      BasicBlock *SuspendPredecessorBlock =
+          Use->getParent()->getSinglePredecessor();
+      assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
+      InsertPoint = SuspendPredecessorBlock->getTerminator();
+    }
+
+    // Note: skip the first instruction as this is the actual use that we're
+    // rematerializing everything for.
+    auto I = RPOT.begin();
+    ++I;
+    for (; I != RPOT.end(); ++I) {
+      Instruction *D = (*I)->Node;
+      CurrentMaterialization = D->clone();
+      CurrentMaterialization->setName(D->getName());
+      CurrentMaterialization->insertBefore(InsertPoint);
+      InsertPoint = CurrentMaterialization;
+
+      // Replace all uses of Def in the instructions being added as part of
+      // this rematerialization group
+      for (auto &I : InstructionsToProcess)
+        I->replaceUsesOfWith(D, CurrentMaterialization);
+
+      // Don't replace the final use at this point as this can cause problems
+      // for other materializations. Instead, for any final use that uses a
+      // define that's being rematerialized, record the replace values
+      for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
+        if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
+          FinalInstructionsToProcess.push_back(
+              {Use, D, CurrentMaterialization});
+
+      InstructionsToProcess.push_back(CurrentMaterialization);
+    }
+  }
+
+  // Finally, replace the uses with the defines that we've just rematerialized
+  for (auto &R : FinalInstructionsToProcess) {
+    if (auto *PN = dyn_cast<PHINode>(R.Use)) {
+      assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
+                                                "values in the PHINode");
+      PN->replaceAllUsesWith(R.Remat);
+      PN->eraseFromParent();
+      continue;
+    }
+    R.Use->replaceUsesOfWith(R.Def, R.Remat);
+  }
+}
+
+/// Default materializable callback
+// Check for instructions that we can recreate on resume as opposed to spill
+// the result into a coroutine frame.
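+// For example, a getelementptr or an icmp whose result is live across a
+// suspend point can simply be recomputed after the resume instead of being
+// spilled to a coroutine frame slot; the instruction categories accepted
+// below reflect that trade-off.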
+bool llvm::coro::defaultMaterializable(Instruction &V) {
+  return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
+          isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
+}
+
+bool llvm::coro::isTriviallyMaterializable(Instruction &V) {
+  return defaultMaterializable(V);
+}
+
+#ifndef NDEBUG
+static void dumpRemats(
+    StringRef Title,
+    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
+  dbgs() << "------------- " << Title << "--------------\n";
+  for (const auto &E : RM) {
+    E.second->dump();
+    dbgs() << "--\n";
+  }
+}
+#endif
+
+void coro::doRematerializations(
+    Function &F, SuspendCrossingInfo &Checker,
+    std::function<bool(Instruction &)> IsMaterializable) {
+  if (F.hasOptNone())
+    return;
+
+  coro::SpillInfo Spills;
+
+  // See if there are materializable instructions across suspend points.
+  // We record these as the starting point to also identify materializable
+  // defs of uses in these operations.
+  for (Instruction &I : instructions(F)) {
+    if (!IsMaterializable(I))
+      continue;
+    for (User *U : I.users())
+      if (Checker.isDefinitionAcrossSuspend(I, U))
+        Spills[&I].push_back(cast<Instruction>(U));
+  }
+
+  // Process each of the identified rematerializable instructions
+  // and add predecessor instructions that can also be rematerialized.
+  // This is actually a graph of instructions since we could potentially
+  // have multiple uses of a def in the set of predecessor instructions.
+  // The approach here is to maintain a graph of instructions for each bottom
+  // level instruction - where we have a unique set of instructions (nodes)
+  // and edges between them. We then walk the graph in reverse post-order to
+  // insert them past the suspend point, but ensure that ordering is
+  // correct. We also rely on CSE removing duplicate defs for remats of
+  // different instructions with a def in common (rather than maintaining more
+  // complex graphs for each suspend point).
+
+  // We can do this by adding new nodes to the list for each suspend
+  // point. Then using standard GraphTraits to give a reverse post-order
+  // traversal when we insert the nodes after the suspend.
+  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
+  for (auto &E : Spills) {
+    for (Instruction *U : E.second) {
+      // Don't process a user twice (this can happen if the instruction uses
+      // more than one rematerializable def)
+      if (AllRemats.count(U))
+        continue;
+
+      // Constructor creates the whole RematGraph for the given Use
+      auto RematUPtr =
+          std::make_unique<RematGraph>(IsMaterializable, U, Checker);
+
+      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
+                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
+                 for (auto I = RPOT.begin(); I != RPOT.end();
+                      ++I) { (*I)->Node->dump(); } dbgs()
+                 << "\n";);
+
+      AllRemats[U] = std::move(RematUPtr);
+    }
+  }
+
+  // Rewrite materializable instructions to be materialized at the use
+  // point.
+  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
+  rewriteMaterializableInstructions(AllRemats);
+}
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.h b/llvm/lib/Transforms/Coroutines/MaterializationUtils.h
new file mode 100644
index 0000000000000..f391851c97b3b
--- /dev/null
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.h
@@ -0,0 +1,30 @@
+//===- MaterializationUtils.h - Utilities for doing materialization ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
+#define LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
+
+#include "SuspendCrossingInfo.h"
+#include "llvm/IR/Instruction.h"
+
+namespace llvm {
+
+namespace coro {
+
+// True if I is trivially rematerializable, e.g. InsertElementInst
+bool isTriviallyMaterializable(Instruction &I);
+
+// Performs rematerialization, invoked from buildCoroutineFrame.
+void doRematerializations(Function &F, SuspendCrossingInfo &Checker,
+                          std::function<bool(Instruction &)> IsMaterializable);
+
+} // namespace coro
+
+} // namespace llvm
+
+#endif // LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
index d71b0a336d471..4c12e66f288db 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
@@ -23,17 +23,6 @@ namespace {

 typedef SmallPtrSet<BasicBlock *, 8> VisitedBlocksSet;

-static bool isSuspendBlock(BasicBlock *BB) {
-  return isa<AnyCoroSuspendInst>(BB->front());
-}
-
-// Check for structural coroutine intrinsics that should not be spilled into
-// the coroutine frame.
-static bool isCoroutineStructureIntrinsic(Instruction &I) {
-  return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
-         isa<CoroSuspendInst>(&I);
-}
-
 /// Does control flow starting at the given block ever reach a suspend
 /// instruction before reaching a block in VisitedOrFreeBBs?
 static bool isSuspendReachableFrom(BasicBlock *From,
@@ -45,7 +34,7 @@ static bool isSuspendReachableFrom(BasicBlock *From,
     return false;

   // We assume that we'll already have split suspends into their own blocks.
-  if (isSuspendBlock(From))
+  if (coro::isSuspendBlock(From))
     return true;

   // Recurse on the successors.
@@ -448,6 +437,13 @@ static void collectFrameAlloca(AllocaInst *AI, const coro::Shape &Shape,

 } // namespace

+// Check for structural coroutine intrinsics that should not be spilled into
+// the coroutine frame.
+bool isCoroutineStructureIntrinsic(Instruction &I) {
+  return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
+         isa<CoroSuspendInst>(&I);
+}
+
 void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
                            const SuspendCrossingInfo &Checker) {
   // Collect the spills for arguments and other not-materializable values.
@@ -626,6 +622,6 @@ BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
   return InsertPt;
 }

-} // End namespace coro.
+} // namespace coro

-} // End namespace llvm.
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.h b/llvm/lib/Transforms/Coroutines/SpillUtils.h
index de0ff0bcd3a4f..8843b611e0842 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.h
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.h
@@ -29,8 +29,6 @@ struct AllocaInfo {
         MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {}
 };

-bool isSuspendBlock(BasicBlock *BB);
-
 void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
                            const SuspendCrossingInfo &Checker);
 void collectSpillsAndAllocasFromInsts(
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 1aac8e0713587..ff0d78178bd18 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -46,7 +46,6 @@
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 5c09bb1800cb2..fa25baee2ba03 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -470,8 +470,20 @@ class CallsiteContextGraph {
 private:
   using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;

-  using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
-                                     const FuncTy *, DenseSet<uint32_t>>;
+  // Structure to keep track of information for each call as we are matching
+  // non-allocation callsites onto context nodes created from the allocation
+  // call metadata / summary contexts.
+  struct CallContextInfo {
+    // The callsite we're trying to match.
+    CallTy Call;
+    // The callsite's stack ids that have a context node in the graph.
+    std::vector<uint64_t> StackIds;
+    // The function containing this callsite.
+    const FuncTy *Func;
+    // Initially empty, if needed this will be updated to contain the context
+    // ids for use in a new context node created for this callsite.
+    DenseSet<uint32_t> ContextIds;
+  };

   /// Assigns the given Node to calls at or inlined into the location with
   /// the Node's stack id, after post order traversing and processing its
@@ -1458,7 +1470,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
     auto &Calls = It.getSecond();
     // Skip single calls with a single stack id. These don't need a new node.
     if (Calls.size() == 1) {
-      auto &Ids = std::get<1>(Calls[0]);
+      auto &Ids = Calls[0].StackIds;
       if (Ids.size() == 1)
         continue;
     }
@@ -1474,18 +1486,15 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // that to sort by.
DenseMap FuncToIndex; for (const auto &[Idx, CallCtxInfo] : enumerate(Calls)) - FuncToIndex.insert({std::get<2>(CallCtxInfo), Idx}); + FuncToIndex.insert({CallCtxInfo.Func, Idx}); std::stable_sort( Calls.begin(), Calls.end(), [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) { - auto &IdsA = std::get<1>(A); - auto &IdsB = std::get<1>(B); - auto *FuncA = std::get<2>(A); - auto *FuncB = std::get<2>(B); - return IdsA.size() > IdsB.size() || - (IdsA.size() == IdsB.size() && - (IdsA < IdsB || - (IdsA == IdsB && FuncToIndex[FuncA] < FuncToIndex[FuncB]))); + return A.StackIds.size() > B.StackIds.size() || + (A.StackIds.size() == B.StackIds.size() && + (A.StackIds < B.StackIds || + (A.StackIds == B.StackIds && + FuncToIndex[A.Func] < FuncToIndex[B.Func]))); }); // Find the node for the last stack id, which should be the same @@ -1520,7 +1529,7 @@ void CallsiteContextGraph::updateStackNodes() { #ifndef NDEBUG // If this call has a different set of ids than the last one, clear the // set used to ensure they are sorted properly. - if (I > 0 && Ids != std::get<1>(Calls[I - 1])) + if (I > 0 && Ids != Calls[I - 1].StackIds) MatchingIdsFuncSet.clear(); else // If the prior call had the same stack ids this set would not be empty. @@ -1607,17 +1616,18 @@ void CallsiteContextGraph::updateStackNodes() { // assigned to the same context node, and skip them. bool DuplicateContextIds = false; for (unsigned J = I + 1; J < Calls.size(); J++) { - auto &NextIds = std::get<1>(Calls[J]); + auto &CallCtxInfo = Calls[J]; + auto &NextIds = CallCtxInfo.StackIds; if (NextIds != Ids) break; - auto *NextFunc = std::get<2>(Calls[J]); + auto *NextFunc = CallCtxInfo.Func; if (NextFunc != Func) { // We have another Call with the same ids but that cannot share this // node, must duplicate ids for it. DuplicateContextIds = true; break; } - auto &NextCall = std::get<0>(Calls[J]); + auto &NextCall = CallCtxInfo.Call; CallToMatchingCall[NextCall] = Call; // Update I so that it gets incremented correctly to skip this call. I = J; diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 3f15fa2163d27..d81665622809c 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1950,7 +1950,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { /// during call. Try to use memcpy source directly if all of the following /// conditions are satisfied. /// 1. The memcpy dst is neither modified during the call nor captured by the -/// call. (if readonly, noalias, nocapture attributes on call-site.) +/// call. /// 2. The memcpy dst is an alloca with known alignment & size. /// 2-1. The memcpy length == the alloca size which ensures that the new /// pointer is dereferenceable for the required range @@ -1961,12 +1961,22 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { /// 4. The memcpy src is not modified during the call. (ModRef check shows no /// Mod.) bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) { + BatchAAResults BAA(*AA); + Value *ImmutArg = CB.getArgOperand(ArgNo); + // 1. Ensure passed argument is immutable during call. - if (!(CB.paramHasAttr(ArgNo, Attribute::NoAlias) && - CB.paramHasAttr(ArgNo, Attribute::NoCapture))) + if (!CB.paramHasAttr(ArgNo, Attribute::NoCapture)) + return false; + + // We know that the argument is readonly at this point, but the function + // might still modify the same memory through a different pointer. 
Exclude + // this either via noalias, or alias analysis. + if (!CB.paramHasAttr(ArgNo, Attribute::NoAlias) && + isModSet( + BAA.getModRefInfo(&CB, MemoryLocation::getBeforeOrAfter(ImmutArg)))) return false; + const DataLayout &DL = CB.getDataLayout(); - Value *ImmutArg = CB.getArgOperand(ArgNo); // 2. Check that arg is alloca // TODO: Even if the arg gets back to branches, we can remove memcpy if all @@ -1986,7 +1996,6 @@ bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) { return false; MemCpyInst *MDep = nullptr; - BatchAAResults BAA(*AA); MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( CallAccess->getDefiningAccess(), Loc, BAA); if (auto *MD = dyn_cast(Clobber)) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index d0669e44f821b..c85c819263e2a 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1028,7 +1028,14 @@ CanRedirectPredsOfEmptyBBToSucc(BasicBlock *BB, BasicBlock *Succ, if (!BB->hasNPredecessorsOrMore(2)) return false; - // Get single common predecessors of both BB and Succ + if (any_of(BBPreds, [](const BasicBlock *Pred) { + return isa(Pred->begin()) && + isa(Pred->getTerminator()); + })) + return false; + + // Get the single common predecessor of both BB and Succ. Return false + // when there are more than one common predecessors. for (BasicBlock *SuccPred : SuccPreds) { if (BBPreds.count(SuccPred)) { if (CommonPred) @@ -1133,7 +1140,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, bool BBKillable = CanPropagatePredecessorsForPHIs(BB, Succ, BBPreds); - // Even if we can not fold bB into Succ, we may be able to redirect the + // Even if we can not fold BB into Succ, we may be able to redirect the // predecessors of BB to Succ. bool BBPhisMergeable = BBKillable || diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2be3b57752925..b821da03c16e9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7314,9 +7314,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, // Return true if the loop contains any instructions that are not also part of // the VPlan or are skipped for VPlan-based cost computations. This indicates // that the VPlan contains extra simplifications. 
- return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx](BasicBlock *BB) { - return any_of(*BB, [&SeenInstrs, &CostCtx](Instruction &I) { - if (isa(&I)) + return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx, + TheLoop](BasicBlock *BB) { + return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) { + if (isa(&I) && BB == TheLoop->getHeader()) return false; return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true); }); diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll index de178cdf19308..29dce5f21173a 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,31 +14,136 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; SSE2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; SSE2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: 
LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; SSE2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 75 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 150 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; 
AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 75 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 150 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr 
%in4, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 145 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: 
LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll index 1f54b7485aa8f..0e7b1c58e587c 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,31 +14,179 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; SSE2: 
LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; SSE2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; SSE2: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; 
AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 23 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 105 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr 
%in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 105 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 
4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated 
cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll index d53dca05155b7..8830aff579c32 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,30 +14,203 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 ; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4 ; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = 
load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4 ; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: 
%v6 = load float, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX1: LV: Found an estimated cost of 120 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX1: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For 
instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: 
%v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; 
AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX512: LV: Found an estimated cost of 40 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX512: LV: Found an estimated cost of 92 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll index 1575f92465d52..cfd3d7841caa2 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, 
ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,35 +14,67 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8 ; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 ; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 ; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 ; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 112 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 ; ; AVX2-LABEL: 
'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll index 89175a65990f6..5ec5b51731385 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll +++ 
b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,32 +14,86 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; SSE2: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; SSE2: LV: Found an estimated cost of 36 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 84 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
index 8db9fd364133e..450743df72325 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,107 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
index 25c49e3b8a811..5e5c718dba97d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,116 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
index 42c980b6d3985..62541fa2368c6 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,137 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 36 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 84 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
index 68afa6d17f02f..cfed8554b978b 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,158 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
index 7894912c88fab..07939b914d022 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,27 +14,171 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8
;
entry:
  br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
index d8eaa0aad61d5..964a9b660942e 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,193 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
; SSE2: LV: Found an estimated cost of 43 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
; SSE2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX1: LV: Found an estimated cost of 25 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX1: LV: Found an estimated cost of 43 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX1: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX1: LV: Found an estimated cost of 175 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX1: LV: Found an estimated cost of 350 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX2: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX2: LV: Found an estimated cost of 330 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512DQ: LV: Found an estimated cost of 25 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512DQ: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512DQ: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512DQ: LV: Found an estimated cost of 175 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512DQ: LV: Found an estimated cost of 355 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512DQ: LV: Found an estimated cost of 710 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
;
; AVX512BW-LABEL: 'test'
; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512BW: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512BW: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512BW: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512BW: LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512BW: LV: Found an estimated cost of 55 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX512BW: LV: Found an estimated cost of 235 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
;
entry:
  br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
index 9c0d102a70d1e..6653198397dd2 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
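The same filter broadening applies to this stride-7 i16 test. The test body itself sits outside the diff context, but the shape of loop the checks refer to can be sketched. The following is a minimal, hypothetical reduction (not the actual test source): seven i16 loads per iteration at offsets 7*i+0 through 7*i+6, which the loop vectorizer recognizes as a single factor-7 interleave group whose whole cost is attributed to %v0.

  ; Hypothetical reduction of the stride-7 access pattern; names %v0-%v6 and
  ; %in0-%in6 mirror the check lines, everything else is illustrative.
  define void @stride7(ptr %in, ptr %out, i64 %n) {
  entry:
    br label %for.body

  for.body:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
    %base = mul nuw nsw i64 %iv, 7          ; stride of 7 i16 elements
    %off1 = add nuw nsw i64 %base, 1
    %off2 = add nuw nsw i64 %base, 2
    %off3 = add nuw nsw i64 %base, 3
    %off4 = add nuw nsw i64 %base, 4
    %off5 = add nuw nsw i64 %base, 5
    %off6 = add nuw nsw i64 %base, 6
    %in0 = getelementptr inbounds i16, ptr %in, i64 %base
    %in1 = getelementptr inbounds i16, ptr %in, i64 %off1
    %in2 = getelementptr inbounds i16, ptr %in, i64 %off2
    %in3 = getelementptr inbounds i16, ptr %in, i64 %off3
    %in4 = getelementptr inbounds i16, ptr %in, i64 %off4
    %in5 = getelementptr inbounds i16, ptr %in, i64 %off5
    %in6 = getelementptr inbounds i16, ptr %in, i64 %off6
    %v0 = load i16, ptr %in0, align 2       ; group leader: carries the cost
    %v1 = load i16, ptr %in1, align 2       ; members %v1-%v6 report cost 0
    %v2 = load i16, ptr %in2, align 2
    %v3 = load i16, ptr %in3, align 2
    %v4 = load i16, ptr %in4, align 2
    %v5 = load i16, ptr %in5, align 2
    %v6 = load i16, ptr %in6, align 2
    %s1 = add i16 %v0, %v1                  ; keep the loads live
    %s2 = add i16 %s1, %v2
    %s3 = add i16 %s2, %v3
    %s4 = add i16 %s3, %v4
    %s5 = add i16 %s4, %v5
    %s6 = add i16 %s5, %v6
    %outp = getelementptr inbounds i16, ptr %out, i64 %iv
    store i16 %s6, ptr %outp, align 2
    %iv.next = add nuw nsw i64 %iv, 1
    %done = icmp eq i64 %iv.next, %n
    br i1 %done, label %exit, label %for.body

  exit:
    ret void
  }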
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,265 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
; SSE2: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
; SSE2: LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX1: LV: Found an estimated cost of 62 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX1: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX1: LV: Found an estimated cost of 245 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX1: LV: Found an estimated cost of 490 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX2: LV: Found an estimated cost of 231 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX2: LV: Found an estimated cost of 462 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512DQ: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512DQ: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512DQ: LV: Found an estimated cost of 121 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512DQ: LV: Found an estimated cost of 245 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512DQ: LV: Found an estimated cost of 497 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512DQ: LV: Found an estimated cost of 994 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2
;
; AVX512BW-LABEL: 'test'
; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512BW: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512BW: LV: Found an estimated cost of 15 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512BW: LV: Found an estimated cost of 19 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512BW: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align
2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 112 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 469 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll index 7654185635d3e..b3a5cbeccc09c 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in." 
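
The only hand-written change in each of these test files is the UTC_ARGS `--filter` regex in the NOTE line above: `%v0 = load i16, ptr %in0` is widened to `%v. = load i16, ptr %in.`, so the regenerated assertions keep one cost line per interleave-group member instead of only the first. A minimal sketch of the effect, assuming the filter behaves like an ordinary Python regular expression when the update script selects debug lines (the sample strings below are stand-ins for the vectorizer's output):

```python
import re

# Old filter pins the lane to %v0/%in0; in the new one '.' matches any
# single character, so %v0..%v7 and %in0..%in7 all pass.
OLD = r"LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
NEW = r"LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."

samples = [
    "LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2",
    "LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2",
]

for s in samples:
    print(bool(re.search(OLD, s)), bool(re.search(NEW, s)))
# -> True True   (the %v0 line is kept by both filters)
# -> False True  (the %v7 line is kept only by the widened filter)
```
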
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,301 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
; SSE2: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
; SSE2: LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
; SSE2: LV: Found an estimated cost of 136 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
; SSE2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX1: LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX1: LV: Found an estimated cost of 136 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX1: LV: Found an estimated cost of 280 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX1: LV: Found an estimated cost of 560 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX2: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX2: LV: Found an estimated cost of 528 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512DQ: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512DQ: LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512DQ: LV: Found an estimated cost of 136 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512DQ: LV: Found an estimated cost of 280 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512DQ: LV: Found an estimated cost of 568 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512DQ: LV: Found an estimated cost of 1136 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i16, ptr %in7, align 2
;
; AVX512BW-LABEL: 'test'
; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512BW: LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512BW: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512BW: LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512BW: LV: Found an estimated cost of 64 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512BW: LV: Found an estimated cost of 148 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
; AVX512BW: LV: Found an estimated cost of 616 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i16, ptr %in7, align 2
;
entry:
  br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
index 86590c0fa6a9c..c0ea210385dfd 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
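
One pattern the regenerated checks make visible: for every VF greater than 1, the full cost of the interleaved load group is reported on the first member (%v0) while the remaining members report 0, so the per-group cost can be read off the %v0 line alone. A small illustration using the AVX2 VF 2 figures from the `01uu` test below (the lane names are just labels for this sketch):

```python
# Costs as printed below for the AVX2 run at VF 2: the group cost (3) sits on
# the first member, and the other used lane reports 0.
lane_costs = {"%v0": 3, "%v1": 0}

# Summing the lanes therefore just recovers the group cost from the %v0 line.
group_cost = sum(lane_costs.values())
assert group_cost == lane_costs["%v0"] == 3
print(f"interleave-group cost at VF 2: {group_cost}")
```
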
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,35 +14,67 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
; SSE2: LV: Found an estimated cost of 60 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
; SSE2: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX1: LV: Found an estimated cost of 10 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX1: LV: Found an estimated cost of 20 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX1: LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX1: LV: Found an estimated cost of 84 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX1: LV: Found an estimated cost of 168 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX2: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX512: LV: Found an estimated cost of 13 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX512: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
;
entry:
  br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
index 63901617bb9dd..2a261ca4de4fa 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,136 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
; SSE2: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
; SSE2: LV: Found an estimated cost of 75 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
; SSE2: LV: Found an estimated cost of 150 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX1: LV: Found an estimated cost of 23 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX1: LV: Found an estimated cost of 95 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX1: LV: Found an estimated cost of 190 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX512: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX512: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX512: LV: Found an estimated cost of 145 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4
;
entry:
  br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
index 1eabac4e0b9c3..8bf3071d29fbe 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,179 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
; SSE2: LV: Found an estimated cost of 49 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
; SSE2: LV: Found an estimated cost of 105 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load
i32, ptr %in6, align 4 ; SSE2: LV: Found an estimated cost of 210 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 
0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 133 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 266 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, 
align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found 
an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an 
estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll index a1bb2efd73963..3182de2df058a 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,30 +14,203 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, 
ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4 ; SSE2: LV: Found an estimated cost of 120 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4 ; SSE2: LV: Found an estimated cost of 240 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 
for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX1: LV: Found an estimated cost of 36 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX1: LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX1: LV: Found an estimated cost of 152 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX1: LV: Found an estimated cost of 304 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4 ; ; AVX2-LABEL: 
'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load 
i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr 
%in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX512: LV: Found an estimated cost of 40 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX512: LV: Found an estimated cost of 92 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr 
%in6, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll index bd230166ebe78..27e2ee0392615 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,35 +14,67 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 ; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 ; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 ; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 22 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX1: LV: Found an 
estimated cost of 44 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 88 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 176 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an 
estimated cost of 160 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll index e03d3c2f8b3a4..c37723257c1f7 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,32 +14,86 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; SSE2: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; SSE2: LV: Found an estimated cost of 84 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For 
instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 33 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 66 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 132 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, 
ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll index f7249666918dd..2eb7c5e93078f 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in." 
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,107 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 44 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 88 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 176 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
index 96946bd58dea1..c11da4309737d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,116 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 70 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 25 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 55 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
index 2355c6e8b57a1..de57af6ebe398 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,137 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 42 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 84 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 66 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 132 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
index 646003a41dcf5..949c1af1fdad3 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,158 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 49 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 98 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 77 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 154 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 126 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
index 568ab74068f94..4388ccfbdcfc4 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,27 +14,171 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
 ; SSE2: LV: Found an estimated cost of 112 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
 ; AVX1: LV: Found an estimated cost of 40 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
 ; AVX1: LV: Found an estimated cost of 88 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
 ; AVX1: LV: Found an estimated cost of 176 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
 ; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
 ; AVX2: LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for
VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8 ; AVX2: LV: Found an estimated cost of 144 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8 ; AVX512: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For 
instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8 ; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8 ; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll index 6c1dd916311ab..6078fb440f9d1 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an 
estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -15,44 +15,193 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 ; SSE2: LV: Found an estimated cost of 38 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 ; SSE2: LV: Found an estimated cost of 75 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 ; SSE2: LV: Found an estimated cost of 155 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 ; SSE2: LV: Found an estimated cost of 315 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: 
%v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX1: LV: Found an estimated cost of 83 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX1: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX1: LV: Found an estimated cost of 335 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 
For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX2: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX2: LV: Found an estimated cost of 325 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost 
of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512DQ: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512DQ: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512DQ: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512DQ: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512DQ: LV: Found an estimated cost of 335 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, 
align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512DQ: LV: Found an estimated cost of 675 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512BW: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512BW: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512BW: LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512BW: LV: Found an estimated cost of 99 
for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512BW: LV: Found an estimated cost of 198 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX512BW: LV: Found an estimated cost of 395 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll index 1ff3bc57a50d9..ed8bc84e771f8 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -15,44 +15,229 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; SSE2: LV: Found an estimated cost of 47 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; SSE2: LV: Found an estimated cost of 90 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; SSE2: LV: Found an estimated cost of 186 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For 
instruction: %v5 = load i8, ptr %in5, align 1 ; SSE2: LV: Found an estimated cost of 378 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX1: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for 
VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX1: LV: Found an estimated cost of 198 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX1: LV: Found an estimated cost of 402 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an 
estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 46 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 88 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 
2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 45 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 85 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 810 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: 
Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 25 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 49 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 119 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 
= load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 237 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 591 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll index d77bca6b7aa5a..778a4e7dfd7d9 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in." 
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,265 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
 ; SSE2: LV: Found an estimated cost of 57 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
 ; SSE2: LV: Found an estimated cost of 110 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
 ; SSE2: LV: Found an estimated cost of 217 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
 ; SSE2: LV: Found an estimated cost of 441 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX1: LV: Found an estimated cost of 62 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX1: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX1: LV: Found an estimated cost of 231 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX1: LV: Found an estimated cost of 469 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX2: LV: Found an estimated cost of 224 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX2: LV: Found an estimated cost of 455 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
 ;
 ; AVX512DQ-LABEL: 'test'
 ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512DQ: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512DQ: LV: Found an estimated cost of 62 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512DQ: LV: Found an estimated cost of 120 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512DQ: LV: Found an estimated cost of 233 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512DQ: LV: Found an estimated cost of 469 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512DQ: LV: Found an estimated cost of 945 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1
 ;
 ; AVX512BW-LABEL: 'test'
 ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512BW: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512BW: LV: Found an estimated cost of 29 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512BW: LV: Found an estimated cost of 57 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512BW: LV: Found an estimated cost of 138 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512BW: LV: Found an estimated cost of 413 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
 ; AVX512BW: LV: Found an estimated cost of 826 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
index 00ad2f68814b8..a230b5a0b1f2b 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,301 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
 ; SSE2: LV: Found an estimated cost of 120 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
 ; SSE2: LV: Found an estimated cost of 248 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
 ; SSE2: LV: Found an estimated cost of 504 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX1: LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX1: LV: Found an estimated cost of 66 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX1: LV: Found an estimated cost of 132 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX1: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX1: LV: Found an estimated cost of 536 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX2: LV: Found an estimated cost of 256 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX2: LV: Found an estimated cost of 520 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
 ;
 ; AVX512DQ-LABEL: 'test'
 ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512DQ: LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512DQ: LV: Found an estimated cost of 66 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512DQ: LV: Found an estimated cost of 132 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512DQ: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512DQ: LV: Found an estimated cost of 536 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512DQ: LV: Found an estimated cost of 1080 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i8, ptr %in7, align 1
 ;
 ; AVX512BW-LABEL: 'test'
 ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512BW: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512BW: LV: Found an estimated cost of 33 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512BW: LV: Found an estimated cost of 65 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512BW: LV: Found an estimated cost of 158 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512BW: LV: Found an estimated cost of 472 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
 ; AVX512BW: LV: Found an estimated cost of 1100 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i8, ptr %in7, align 1
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
index 678bb8917bd0d..2ad37bee35bed 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
@@ -14,12 +14,14 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
 ; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v7, ptr %out7, align 4
 ; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: store float %v7, ptr %out7, align 4
 ; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: store float %v7, ptr %out7, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
 ; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v7, ptr %out7, align 4
 ; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: store float %v7, ptr %out7, align 4
 ; AVX1: LV: Found an estimated cost of 120 for VF 8 For instruction: store float %v7, ptr %out7, align 4
@@ -27,6 +29,7 @@ define void @test() {
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
 ; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v7, ptr %out7, align 4
 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: store float %v7, ptr %out7, align 4
 ; AVX2: LV: Found an estimated cost of 120 for VF 8 For instruction: store float %v7, ptr %out7, align 4
@@ -34,6 +37,7 @@ define void @test() {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: store float %v7, ptr %out7, align 4
 ; AVX512: LV: Found an estimated cost of 23 for VF 4 For instruction: store float %v7, ptr %out7, align 4
 ; AVX512: LV: Found an estimated cost of 46 for VF 8 For instruction: store float %v7, ptr %out7, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
index 394d1d4de00f5..c1a66c1a41d74 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v7, ptr %out7"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v., ptr %out."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -13,27 +13,172 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 0 for 
VF 4 For instruction: store double %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: store double %v7, ptr %out7, align 8 ; ; AVX1-LABEL: 'test' +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8 +; AVX1: 
LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: store double %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: store double %v7, ptr %out7, align 8 ; ; AVX2-LABEL: 'test' +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: store 
double %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: store double %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: store double %v7, ptr %out7, align 8 ; ; AVX512-LABEL: 'test' +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8 
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 23 for VF 2 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store double %v7, ptr %out7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll index 4d9aad54b0c8f..7be9577960efe 100644 --- 
a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v7, ptr %out7" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v., ptr %out." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -13,27 +13,172 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: store i64 %v7, 
ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 112 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 ; ; AVX1-LABEL: 'test' +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 40 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 
for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 88 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 176 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 ; ; AVX2-LABEL: 'test' +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; 
AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 40 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 88 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 176 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 ; ; AVX512-LABEL: 'test' +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated 
cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 23 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll 
b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll index 741dd0746b744..13a844230f89d 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll @@ -22,6 +22,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly ; DISABLED_MASKED_STRIDED-LABEL: 'test1' ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 13 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 @@ -34,6 +36,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly ; ENABLED_MASKED_STRIDED-LABEL: 'test1' ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 12 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 @@ -79,6 +83,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; DISABLED_MASKED_STRIDED-LABEL: 'test2' ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 @@ -91,6 +97,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; ENABLED_MASKED_STRIDED-LABEL: 'test2' ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; 
ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 @@ -145,6 +153,7 @@ for.end: define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly %x, ptr noalias nocapture readnone %y) { ; DISABLED_MASKED_STRIDED-LABEL: 'test' ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2 @@ -152,6 +161,7 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly ; ; ENABLED_MASKED_STRIDED-LABEL: 'test' ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir new file mode 100644 index 0000000000000..f207e9c149a47 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir @@ -0,0 +1,136 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s --check-prefixes=CHECK + +--- +name: test_combine_trunc_xor_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_xor_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_XOR [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_XOR %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... 
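This new file (combine-narrow-binop.mir, continued below) pins down the AArch64 prelegalizer combine that narrows a truncated wide binary operation: as the CHECK lines for test_combine_trunc_xor_i64 above show, a G_TRUNC of G_XOR at s64 is rewritten so the xor happens at s32 on the truncated operand and constant, and the wide op disappears. A rough LLVM IR analogue of the pattern being matched (illustrative only, not taken from the test):

define i32 @narrow_xor(i64 %x) {
  ; before the combine: a 64-bit xor, then a truncate of its result
  %wide = xor i64 %x, 5
  %narrow = trunc i64 %wide to i32
  ; after the combine this is equivalent to: xor (trunc %x to i32), 5
  ret i32 %narrow
}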
+---
+name: test_combine_trunc_add_i64
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_add_i64
+    ; CHECK: %lhs:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_ADD [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = G_CONSTANT i64 5
+    %res:_(s64) = G_ADD %lhs, %rhs
+    %small:_(s32) = G_TRUNC %res(s64)
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_mul_i64
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_mul_i64
+    ; CHECK: %lhs:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_MUL [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = G_CONSTANT i64 5
+    %res:_(s64) = G_MUL %lhs, %rhs
+    %small:_(s32) = G_TRUNC %res(s64)
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_and_i64
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_and_i64
+    ; CHECK: %lhs:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_AND [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = G_CONSTANT i64 5
+    %res:_(s64) = G_AND %lhs, %rhs
+    %small:_(s32) = G_TRUNC %res(s64)
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_or_i64
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_or_i64
+    ; CHECK: %lhs:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_OR [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = G_CONSTANT i64 5
+    %res:_(s64) = G_OR %lhs, %rhs
+    %small:_(s32) = G_TRUNC %res(s64)
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_sub_i128
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_sub_i128
+    ; CHECK: %lhs:_(s128) = COPY $q0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s128)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_SUB [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s128) = COPY $q0
+    %rhs:_(s128) = G_CONSTANT i128 5
+    %res:_(s128) = G_SUB %lhs, %rhs
+    %small:_(s32) = G_TRUNC %res(s128)
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_sub_i128_multi_use
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_sub_i128_multi_use
+    ; CHECK: %lhs:_(s128) = COPY $q0
+    ; CHECK-NEXT: %rhs:_(s128) = G_CONSTANT i128 5
+    ; CHECK-NEXT: %res:_(s128) = G_SUB %lhs, %rhs
+    ; CHECK-NEXT: %small:_(s32) = G_TRUNC %res(s128)
+    ; CHECK-NEXT: $q0 = COPY %res(s128)
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s128) = COPY $q0
+    %rhs:_(s128) = G_CONSTANT i128 5
+    %res:_(s128) = G_SUB %lhs, %rhs
+    %small:_(s32) = G_TRUNC %res(s128)
+    $q0 = COPY %res(s128)
+    $w0 = COPY %small(s32)
+...
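The last two cases are guards on the combine. In test_combine_trunc_sub_i128_multi_use the wide G_SUB result has a second user ($q0 also copies %res), so the CHECK lines confirm the subtraction stays at s128 rather than being performed at both widths; the case below checks that the pattern deliberately does not fire on <2 x s64> vector operands. An IR-level sketch of the multi-use situation (names assumed, for illustration):

define i32 @narrow_sub_multi_use(i128 %x, ptr %p) {
  %wide = sub i128 %x, 5
  ; the store is a second use of %wide, so narrowing the sub would not
  ; remove the wide operation and the combine should back off
  store i128 %wide, ptr %p, align 16
  %narrow = trunc i128 %wide to i32
  ret i32 %narrow
}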
+--- +name: test_combine_trunc_xor_vector_pattern_did_not_match +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_xor_vector_pattern_did_not_match + ; CHECK: %arg1:_(s64) = COPY $x0 + ; CHECK-NEXT: %arg2:_(s64) = COPY $x0 + ; CHECK-NEXT: %lhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + ; CHECK-NEXT: %rhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + ; CHECK-NEXT: %res:_(<2 x s64>) = G_XOR %lhs, %rhs + ; CHECK-NEXT: %small:_(<2 x s16>) = G_TRUNC %res(<2 x s64>) + ; CHECK-NEXT: $w0 = COPY %small(<2 x s16>) + %arg1:_(s64) = COPY $x0 + %arg2:_(s64) = COPY $x0 + %lhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + %rhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + %res:_(<2 x s64>) = G_XOR %lhs, %rhs + %small:_(<2 x s16>) = G_TRUNC %res(<2 x s64>) + $w0 = COPY %small(<2 x s16>) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir index fee5afd3ddbb2..9ed1e2d9eee3b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir @@ -224,10 +224,10 @@ body: | ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) ; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[PTR_ADD1]](p0) :: (store (s16) into %ir.dst + 16, align 1) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 16448 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; CHECK-NEXT: G_STORE [[C2]](s16), [[PTR_ADD1]](p0) :: (store (s16) into %ir.dst + 16, align 1) ; CHECK-NEXT: RET_ReallyLR %0:_(p0) = COPY $x0 %1:_(s8) = G_CONSTANT i8 64 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir index e51d9bd13163b..a87ff305d1535 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir @@ -8,9 +8,8 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: test - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) - ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: $w0 = COPY [[C]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s16) = G_CONSTANT i16 0 %2:_(s1) = G_CONSTANT i1 true @@ -41,9 +40,7 @@ body: | bb.1: ; CHECK-LABEL: name: test_inverted_div_rem ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s8) - ; CHECK-NEXT: $w0 = COPY [[SEXT]](s32) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s16) = G_CONSTANT i16 0 %2:_(s1) = G_CONSTANT i1 true diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll index 0e35f8240848b..e5cc04f9be1a1 100644 --- a/llvm/test/CodeGen/AArch64/abds.ll +++ b/llvm/test/CodeGen/AArch64/abds.ll @@ -539,6 +539,90 @@ define i64 
@vector_legalized(i16 %a, i16 %b) { ret i64 %z } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; CHECK-LABEL: abd_select_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: cmp w8, w1, sxtb +; CHECK-NEXT: csel w8, w0, w1, lt +; CHECK-NEXT: csel w9, w1, w0, lt +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp slt i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; CHECK-LABEL: abd_select_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: cmp w8, w1, sxth +; CHECK-NEXT: csel w8, w0, w1, le +; CHECK-NEXT: csel w9, w1, w0, le +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp sle i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; CHECK-LABEL: abd_select_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: csel w8, w0, w1, gt +; CHECK-NEXT: csel w9, w1, w0, gt +; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: abd_select_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x1 +; CHECK-NEXT: csel x8, x0, x1, ge +; CHECK-NEXT: csel x9, x1, x0, ge +; CHECK-NEXT: sub x0, x8, x9 +; CHECK-NEXT: ret + %cmp = icmp sge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; CHECK-LABEL: abd_select_i128: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: sbcs xzr, x1, x3 +; CHECK-NEXT: csel x8, x0, x2, lt +; CHECK-NEXT: csel x9, x2, x0, lt +; CHECK-NEXT: csel x10, x1, x3, lt +; CHECK-NEXT: csel x11, x3, x1, lt +; CHECK-NEXT: subs x0, x9, x8 +; CHECK-NEXT: sbc x1, x11, x10 +; CHECK-NEXT: ret + %cmp = icmp slt i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll index eb866e6a78a9b..0a44ae1688458 100644 --- a/llvm/test/CodeGen/AArch64/abdu.ll +++ b/llvm/test/CodeGen/AArch64/abdu.ll @@ -400,6 +400,91 @@ define i64 @vector_legalized(i16 %a, i16 %b) { ret i64 %z } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abdu(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; CHECK-LABEL: abd_select_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: cmp w8, w1, uxtb +; CHECK-NEXT: csel w8, w0, w1, lo +; CHECK-NEXT: csel w9, w1, w0, lo +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp ult i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; CHECK-LABEL: abd_select_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: cmp w8, w1, uxth +; CHECK-NEXT: csel w8, w0, w1, ls +; CHECK-NEXT: csel w9, w1, w0, ls +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp ule i16 %a, %b + %ab = select i1 %cmp, 
i16 %a, i16 %b
+  %ba = select i1 %cmp, i16 %b, i16 %a
+  %sub = sub i16 %ba, %ab
+  ret i16 %sub
+}
+
+define i32 @abd_select_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: abd_select_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    csel w8, w0, w1, hi
+; CHECK-NEXT:    csel w9, w1, w0, hi
+; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt i32 %a, %b
+  %ab = select i1 %cmp, i32 %a, i32 %b
+  %ba = select i1 %cmp, i32 %b, i32 %a
+  %sub = sub i32 %ab, %ba
+  ret i32 %sub
+}
+
+define i64 @abd_select_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: abd_select_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x0, x1
+; CHECK-NEXT:    csel x8, x0, x1, hs
+; CHECK-NEXT:    csel x9, x1, x0, hs
+; CHECK-NEXT:    sub x0, x8, x9
+; CHECK-NEXT:    ret
+  %cmp = icmp uge i64 %a, %b
+  %ab = select i1 %cmp, i64 %a, i64 %b
+  %ba = select i1 %cmp, i64 %b, i64 %a
+  %sub = sub i64 %ab, %ba
+  ret i64 %sub
+}
+
+define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
+; CHECK-LABEL: abd_select_i128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    sbcs xzr, x1, x3
+; CHECK-NEXT:    csel x8, x0, x2, lo
+; CHECK-NEXT:    csel x9, x2, x0, lo
+; CHECK-NEXT:    csel x10, x1, x3, lo
+; CHECK-NEXT:    csel x11, x3, x1, lo
+; CHECK-NEXT:    subs x0, x9, x8
+; CHECK-NEXT:    sbc x1, x11, x10
+; CHECK-NEXT:    ret
+  %cmp = icmp ult i128 %a, %b
+  %ab = select i1 %cmp, i128 %a, i128 %b
+  %ba = select i1 %cmp, i128 %b, i128 %a
+  %sub = sub i128 %ba, %ab
+  ret i128 %sub
+}
+
 declare i8 @llvm.abs.i8(i8, i1)
 declare i16 @llvm.abs.i16(i16, i1)
 declare i32 @llvm.abs.i32(i32, i1)
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll b/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll
index 08bceb850df40..a3efa6b961e63 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll
@@ -1,134 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

 define <8 x i8> @v_orrimm(ptr %A) nounwind {
 ; CHECK-LABEL: v_orrimm:
-; CHECK-NOT: mov
-; CHECK-NOT: mvn
-; CHECK: orr
-	%tmp1 = load <8 x i8>, ptr %A
-	%tmp3 = or <8 x i8> %tmp1, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
-	ret <8 x i8> %tmp3
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    orr.2s v0, #1, lsl #24
+; CHECK-NEXT:    ret
+  %tmp1 = load <8 x i8>, ptr %A
+  %tmp3 = or <8 x i8> %tmp1, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
+  ret <8 x i8> %tmp3
 }

 define <16 x i8> @v_orrimmQ(ptr %A) nounwind {
-; CHECK: v_orrimmQ
-; CHECK-NOT: mov
-; CHECK-NOT: mvn
-; CHECK: orr
-	%tmp1 = load <16 x i8>, ptr %A
-	%tmp3 = or <16 x i8> %tmp1, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
-	ret <16 x i8> %tmp3
+; CHECK-LABEL: v_orrimmQ:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    orr.4s v0, #1, lsl #24
+; CHECK-NEXT:    ret
+  %tmp1 = load <16 x i8>, ptr %A
+  %tmp3 = or <16 x i8> %tmp1, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
+  ret <16 x i8> %tmp3
 }

 define <8 x i8> @v_bicimm(ptr %A) nounwind {
 ; CHECK-LABEL: v_bicimm:
-; CHECK-NOT: mov
-; CHECK-NOT: mvn
-; CHECK: bic
-	%tmp1 = load <8 x i8>, ptr %A
-	%tmp3 = and <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 >
-	ret <8 x i8> %tmp3
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    bic.2s v0, #255, lsl #24
+; CHECK-NEXT:    ret
+  %tmp1 = load <8 x i8>, ptr %A
+  %tmp3 = and <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 >
+  ret <8 x i8> %tmp3
 }

 define <16 x i8> @v_bicimmQ(ptr %A) nounwind {
 ; CHECK-LABEL: v_bicimmQ:
-; CHECK-NOT: mov
-; CHECK-NOT: mvn
-; CHECK: bic
-	%tmp1 = load <16 x i8>, ptr %A
-	%tmp3 = and <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 >
-	ret <16 x i8> %tmp3
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    bic.4s v0, #255, lsl #24
+; CHECK-NEXT:    ret
+  %tmp1 = load <16 x i8>, ptr %A
+  %tmp3 = and <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 >
+  ret <16 x i8> %tmp3
 }

 define <2 x double> @foo(<2 x double> %bar) nounwind {
-; CHECK: foo
-; CHECK: fmov.2d v1, #1.0000000
+; CHECK-LABEL: foo:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov.2d v1, #1.00000000
+; CHECK-NEXT:    fadd.2d v0, v0, v1
+; CHECK-NEXT:    ret
   %add = fadd <2 x double> %bar, <double 1.0, double 1.0>
   ret <2 x double> %add
 }

 define <4 x i32> @movi_4s_imm_t1() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_4s_imm_t1:
-; CHECK: movi.4s v0, #75
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi.4s v0, #75
+; CHECK-NEXT:    ret
+entry:
   ret <4 x i32> <i32 75, i32 75, i32 75, i32 75>
 }

 define <4 x i32> @movi_4s_imm_t2() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_4s_imm_t2:
-; CHECK: movi.4s v0, #75, lsl #8
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi.4s v0, #75, lsl #8
+; CHECK-NEXT:    ret
+entry:
   ret <4 x i32> <i32 19200, i32 19200, i32 19200, i32 19200>
 }

 define <4 x i32> @movi_4s_imm_t3() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_4s_imm_t3:
-; CHECK: movi.4s v0, #75, lsl #16
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi.4s v0, #75, lsl #16
+; CHECK-NEXT:    ret
+entry:
   ret <4 x i32> <i32 4915200, i32 4915200, i32 4915200, i32 4915200>
 }

 define <4 x i32> @movi_4s_imm_t4() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_4s_imm_t4:
-; CHECK: movi.4s v0, #75, lsl #24
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi.4s v0, #75, lsl #24
+; CHECK-NEXT:    ret
+entry:
   ret <4 x i32> <i32 1258291200, i32 1258291200, i32 1258291200, i32 1258291200>
 }

 define <8 x i16> @movi_8h_imm_t5() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_8h_imm_t5:
-; CHECK: movi.8h v0, #75
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi.8h v0, #75
+; CHECK-NEXT:    ret
+entry:
   ret <8 x i16> <i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75>
 }

 ; rdar://11989841
 define <8 x i16> @movi_8h_imm_t6() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_8h_imm_t6:
-; CHECK: movi.8h v0, #75, lsl #8
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi.8h v0, #75, lsl #8
+; CHECK-NEXT:    ret
+entry:
   ret <8 x i16> <i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200>
 }

 define <4 x i32> @movi_4s_imm_t7() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_4s_imm_t7:
-; CHECK: movi.4s v0, #75, msl #8
-ret <4 x i32> <i32 19455, i32 19455, i32 19455, i32 19455>
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi.4s v0, #75, msl #8
+; CHECK-NEXT:    ret
+entry:
+  ret <4 x i32> <i32 19455, i32 19455, i32 19455, i32 19455>
 }

 define <4 x i32> @movi_4s_imm_t8() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_4s_imm_t8:
-; CHECK: movi.4s v0, #75, msl #16
-ret <4 x i32> <i32 4980735, i32 4980735, i32 4980735, i32 4980735>
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi.4s v0, #75, msl #16
+; CHECK-NEXT:    ret
+entry:
+  ret <4 x i32> <i32 4980735, i32 4980735, i32 4980735, i32 4980735>
 }

 define <16 x i8> @movi_16b_imm_t9() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_16b_imm_t9:
-; CHECK: movi.16b v0, #75
-ret <16 x i8> <i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75>
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi.16b v0, #75
+; CHECK-NEXT:    ret
+entry:
+  ret <16 x i8> <i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75>
 }

 define <2 x i64> @movi_2d_imm_t10() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_2d_imm_t10:
-; CHECK: movi.2d v0, #0xff00ff00ff00ff
-ret <2 x i64> <i64 71777214294589695, i64 71777214294589695>
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi.2d v0, #0xff00ff00ff00ff
+; CHECK-NEXT:    ret
+entry:
+  ret <2 x i64> <i64 71777214294589695, i64 71777214294589695>
 }

 define <4 x i32> @movi_4s_imm_t11() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_4s_imm_t11:
-; CHECK: fmov.4s v0, #-0.32812500
-ret <4 x i32> <i32 3198681088, i32 3198681088, i32 3198681088, i32 3198681088>
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov.4s v0, #-0.32812500
+; CHECK-NEXT:    ret
+entry:
+  ret <4 x i32> <i32 3198681088, i32 3198681088, i32 3198681088, i32 3198681088>
 }

 define <2 x i64> @movi_2d_imm_t12() nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: movi_2d_imm_t12:
-; CHECK: fmov.2d v0, #-0.17187500
-ret <2 x i64> <i64 13818732506632945664, i64 13818732506632945664>
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov.2d v0, #-0.17187500
+; CHECK-NEXT:    ret
+entry:
+  ret <2 x i64> <i64 13818732506632945664, i64 13818732506632945664>
 }
diff --git a/llvm/test/CodeGen/AArch64/load-insert-undef.ll b/llvm/test/CodeGen/AArch64/load-insert-undef.ll
new file mode 100644
index 0000000000000..1e776d1c06fcb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/load-insert-undef.ll
@@ -0,0 +1,1098 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+bf16,+sve | FileCheck %s
+
+define <8 x i8> @loadv8i8(ptr %p) {
+; CHECK-LABEL: loadv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    ret
+  %l = load i8, ptr %p
+  %v = insertelement <8 x i8> poison, i8 %l, i32 0
+  ret <8 x i8> %v
+}
+
+define <16 x i8> @loadv16i8(ptr %p) {
+; CHECK-LABEL: loadv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    ret
+  %l = load i8, ptr %p
+  %v = insertelement <16 x i8> poison, i8 %l, i32 0
+  ret <16 x i8> %v
+}
+
+define <4 x i16> @loadv4i16(ptr %p) {
+; CHECK-LABEL: loadv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %l = load i16, ptr %p
+  %v = insertelement <4 x i16> poison, i16 %l, i32 0
+  ret <4 x i16> %v
+}
+
+define <8 x i16> @loadv8i16(ptr %p) {
+; CHECK-LABEL: loadv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %l = load i16, ptr %p
+  %v = insertelement <8 x i16> poison, i16 %l, i32 0
+  ret <8 x i16> %v
+}
+
+define <2 x i32> @loadv2i32(ptr %p) {
+; CHECK-LABEL: loadv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ret
+  %l = load i32, ptr %p
+  %v = insertelement <2 x i32> poison, i32 %l, i32 0
+  ret <2 x i32> %v
+}
+
+define <4 x i32> @loadv4i32(ptr %p) {
+; CHECK-LABEL: loadv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ret
+  %l = load i32, ptr %p
+  %v = insertelement <4 x i32> poison, i32 %l, i32 0
+  ret <4 x i32> %v
+}
+
+define <2 x i64> @loadv2i64(ptr %p) {
+; CHECK-LABEL: loadv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
+  %l = load i64, ptr %p
+  %v = insertelement <2 x i64> poison, i64 %l, i32 0
+  ret <2 x i64> %v
+}
+
+
+define <4 x half> @loadv4f16(ptr %p) {
+; CHECK-LABEL: loadv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %l = load half, ptr %p
+  %v = insertelement <4 x half> poison, half %l, i32 0
+  ret <4 x half> %v
+}
+
+define <8 x half> @loadv8f16(ptr %p) {
+; CHECK-LABEL: loadv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %l = load half, ptr %p
+  %v = insertelement <8 x half> poison, half %l, i32 0
+  ret <8 x half> %v
+}
+
+define <4 x bfloat> @loadv4bf16(ptr %p) {
+; CHECK-LABEL: loadv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %l = load bfloat, ptr %p
+  %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0
+  ret <4 x bfloat> %v
+}
+
+define <8 x bfloat> @loadv8bf16(ptr %p) {
+; CHECK-LABEL: loadv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %l = load bfloat, ptr %p
+  %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0
+  ret <8 x bfloat> %v
+}
+
+define <2 x float> @loadv2f32(ptr %p) {
+; CHECK-LABEL: loadv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ret
+  %l = load float, ptr %p
+  %v = insertelement <2 x float> poison, float %l, i32 0
+  ret <2 x float> %v
+}
+
+define <4 x float> @loadv4f32(ptr %p) {
+; CHECK-LABEL: loadv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr 
s0, [x0] +; CHECK-NEXT: ret + %l = load float, ptr %p + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64(ptr %p) { +; CHECK-LABEL: loadv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %l = load double, ptr %p + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +; Unscaled + +define <8 x i8> @loadv8i8_offset(ptr %p) { +; CHECK-LABEL: loadv8i8_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_offset(ptr %p) { +; CHECK-LABEL: loadv16i8_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_offset(ptr %p) { +; CHECK-LABEL: loadv4i16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_offset(ptr %p) { +; CHECK-LABEL: loadv8i16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_offset(ptr %p) { +; CHECK-LABEL: loadv2i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_offset(ptr %p) { +; CHECK-LABEL: loadv4i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_offset(ptr %p) { +; CHECK-LABEL: loadv2i64_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur x8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + + +define <4 x half> @loadv4f16_offset(ptr %p) { +; CHECK-LABEL: loadv4f16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_offset(ptr %p) { +; CHECK-LABEL: loadv8f16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load half, ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_offset(ptr %p) { +; CHECK-LABEL: loadv4bf16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x 
bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_offset(ptr %p) { +; CHECK-LABEL: loadv8bf16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_offset(ptr %p) { +; CHECK-LABEL: loadv2f32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_offset(ptr %p) { +; CHECK-LABEL: loadv4f32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_offset(ptr %p) { +; CHECK-LABEL: loadv2f64_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +define <8 x i8> @loadv8i8_noffset(ptr %p) { +; CHECK-LABEL: loadv8i8_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurb w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_noffset(ptr %p) { +; CHECK-LABEL: loadv16i8_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurb w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_noffset(ptr %p) { +; CHECK-LABEL: loadv4i16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_noffset(ptr %p) { +; CHECK-LABEL: loadv8i16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_noffset(ptr %p) { +; CHECK-LABEL: loadv2i32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_noffset(ptr %p) { +; CHECK-LABEL: loadv4i32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_noffset(ptr %p) { +; CHECK-LABEL: loadv2i64_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur x8, [x0, #-1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, 
i64 %l, i32 0 + ret <2 x i64> %v +} + + +define <4 x half> @loadv4f16_noffset(ptr %p) { +; CHECK-LABEL: loadv4f16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_noffset(ptr %p) { +; CHECK-LABEL: loadv8f16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load half, ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_noffset(ptr %p) { +; CHECK-LABEL: loadv4bf16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_noffset(ptr %p) { +; CHECK-LABEL: loadv8bf16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_noffset(ptr %p) { +; CHECK-LABEL: loadv2f32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_noffset(ptr %p) { +; CHECK-LABEL: loadv4f32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_noffset(ptr %p) { +; CHECK-LABEL: loadv2f64_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +; ROW addressing modes + +define <8 x i8> @loadv8i8_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8i8_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, w1, sxtw] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i32 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv16i8_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, w1, sxtw] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i32 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4i16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i32 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8i16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i32 %o + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roW(ptr 
%p, i32 %o) { +; CHECK-LABEL: loadv2i32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i32 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4i32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i32 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2i64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i32 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i32 %o + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + +; roX + +define <8 x i8> @loadv8i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roX(ptr %p, i64 %o) { +; 
CHECK-LABEL: loadv16i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i64 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr 
s0, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds float, ptr %p, i64 %o
+  %l = load float, ptr %g
+  %v = insertelement <4 x float> poison, float %l, i32 0
+  ret <4 x float> %v
+}
+
+define <2 x double> @loadv2f64_roX(ptr %p, i64 %o) {
+; CHECK-LABEL: loadv2f64_roX:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds double, ptr %p, i64 %o
+  %l = load double, ptr %g
+  %v = insertelement <2 x double> poison, double %l, i32 0
+  ret <2 x double> %v
+}
+
+
+; SVE
+
+define <vscale x 8 x i8> @loadnxv8i8(ptr %p) {
+; CHECK-LABEL: loadnxv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %l = load i8, ptr %p
+  %v = insertelement <vscale x 8 x i8> poison, i8 %l, i32 0
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 16 x i8> @loadnxv16i8(ptr %p) {
+; CHECK-LABEL: loadnxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %l = load i8, ptr %p
+  %v = insertelement <vscale x 16 x i8> poison, i8 %l, i32 0
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 4 x i16> @loadnxv4i16(ptr %p) {
+; CHECK-LABEL: loadnxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %l = load i16, ptr %p
+  %v = insertelement <vscale x 4 x i16> poison, i16 %l, i32 0
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 8 x i16> @loadnxv8i16(ptr %p) {
+; CHECK-LABEL: loadnxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %l = load i16, ptr %p
+  %v = insertelement <vscale x 8 x i16> poison, i16 %l, i32 0
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 2 x i32> @loadnxv2i32(ptr %p) {
+; CHECK-LABEL: loadnxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %l = load i32, ptr %p
+  %v = insertelement <vscale x 2 x i32> poison, i32 %l, i32 0
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 4 x i32> @loadnxv4i32(ptr %p) {
+; CHECK-LABEL: loadnxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %l = load i32, ptr %p
+  %v = insertelement <vscale x 4 x i32> poison, i32 %l, i32 0
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 2 x i64> @loadnxv2i64(ptr %p) {
+; CHECK-LABEL: loadnxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %l = load i64, ptr %p
+  %v = insertelement <vscale x 2 x i64> poison, i64 %l, i32 0
+  ret <vscale x 2 x i64> %v
+}
+
+
+define <vscale x 4 x half> @loadnxv4f16(ptr %p) {
+; CHECK-LABEL: loadnxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %l = load half, ptr %p
+  %v = insertelement <vscale x 4 x half> poison, half %l, i32 0
+  ret <vscale x 4 x half> %v
+}
+
+define <vscale x 8 x half> @loadnxv8f16(ptr %p) {
+; CHECK-LABEL: loadnxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %l = load half, ptr %p
+  %v = insertelement <vscale x 8 x half> poison, half %l, i32 0
+  ret <vscale x 8 x half> %v
+}
+
+define <vscale x 4 x bfloat> @loadnxv4bf16(ptr %p) {
+; CHECK-LABEL: loadnxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %l = load bfloat, ptr %p
+  %v = insertelement <vscale x 4 x bfloat> poison, bfloat %l, i32 0
+  ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @loadnxv8bf16(ptr %p) {
+; CHECK-LABEL: loadnxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %l = load bfloat, ptr %p
+  %v = insertelement <vscale x 8 x bfloat> poison, bfloat %l, i32 0
+  ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 2 x float> @loadnxv2f32(ptr %p) {
+; CHECK-LABEL: loadnxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ret
+  %l = load float, ptr %p
+  %v = insertelement <vscale x 2 x float> poison, float %l, i32 0
+  ret <vscale x 2 x float> %v
+}
+
+define <vscale x 4 x float> @loadnxv4f32(ptr %p) {
+; CHECK-LABEL: loadnxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ret
+  %l = load float, ptr %p
+  %v = insertelement <vscale x 4 x float> poison, float %l, i32 0
+  ret <vscale x 4 x float> %v
+}
+
+define <vscale x 2 x double> @loadnxv2f64(ptr %p) {
+; CHECK-LABEL: loadnxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
+  %l = load double, ptr %p
+  %v = insertelement <vscale x 2 x double> poison, double %l, i32 0
+  ret <vscale x 2 x double> %v
+}
+
+
+; Unscaled
+
+define <vscale x 8 x i8> @loadnxv8i8_offset(ptr %p) {
+; CHECK-LABEL: loadnxv8i8_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrb w8, [x0, #1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load i8, ptr %g
+  %v = insertelement <vscale x 8 x i8> poison, i8 %l, i32 0
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 16 x i8> @loadnxv16i8_offset(ptr %p) {
+; CHECK-LABEL: loadnxv16i8_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrb w8, [x0, #1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load i8, ptr %g
+  %v = insertelement <vscale x 16 x i8> poison, i8 %l, i32 0
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 4 x i16> @loadnxv4i16_offset(ptr %p) {
+; CHECK-LABEL: loadnxv4i16_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldurh w8, [x0, #1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load i16, ptr %g
+  %v = insertelement <vscale x 4 x i16> poison, i16 %l, i32 0
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 8 x i16> @loadnxv8i16_offset(ptr %p) {
+; CHECK-LABEL: loadnxv8i16_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldurh w8, [x0, #1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load i16, ptr %g
+  %v = insertelement <vscale x 8 x i16> poison, i16 %l, i32 0
+  ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 2 x i32> @loadnxv2i32_offset(ptr %p) {
+; CHECK-LABEL: loadnxv2i32_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur w8, [x0, #1]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load i32, ptr %g
+  %v = insertelement <vscale x 2 x i32> poison, i32 %l, i32 0
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 4 x i32> @loadnxv4i32_offset(ptr %p) {
+; CHECK-LABEL: loadnxv4i32_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur w8, [x0, #1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load i32, ptr %g
+  %v = insertelement <vscale x 4 x i32> poison, i32 %l, i32 0
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 2 x i64> @loadnxv2i64_offset(ptr %p) {
+; CHECK-LABEL: loadnxv2i64_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur x8, [x0, #1]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load i64, ptr %g
+  %v = insertelement <vscale x 2 x i64> poison, i64 %l, i32 0
+  ret <vscale x 2 x i64> %v
+}
+
+
+define <vscale x 4 x half> @loadnxv4f16_offset(ptr %p) {
+; CHECK-LABEL: loadnxv4f16_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur h0, [x0, #1]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load half, ptr %g
+  %v = insertelement <vscale x 4 x half> poison, half %l, i32 0
+  ret <vscale x 4 x half> %v
+}
+
+define <vscale x 8 x half> @loadnxv8f16_offset(ptr %p) {
+; CHECK-LABEL: loadnxv8f16_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur h0, [x0, #1]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load half, ptr %g
+  %v = insertelement <vscale x 8 x half> poison, half %l, i32 0
+  ret <vscale x 8 x half> %v
+}
+
+define <vscale x 4 x bfloat> @loadnxv4bf16_offset(ptr %p) {
+; CHECK-LABEL: loadnxv4bf16_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur h0, [x0, #1]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load bfloat, ptr %g
+  %v = insertelement <vscale x 4 x bfloat> poison, bfloat %l, i32 0
+  ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @loadnxv8bf16_offset(ptr %p) {
+; CHECK-LABEL: loadnxv8bf16_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur h0, [x0, #1]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load bfloat, ptr %g
+  %v = insertelement <vscale x 8 x bfloat> poison, bfloat %l, i32 0
+  ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 2 x float> @loadnxv2f32_offset(ptr %p) {
+; CHECK-LABEL: loadnxv2f32_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur s0, [x0, #1]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load float, ptr %g
+  %v = insertelement <vscale x 2 x float> poison, float %l, i32 0
+  ret <vscale x 2 x float> %v
+}
+
+define <vscale x 4 x float> @loadnxv4f32_offset(ptr %p) {
+; CHECK-LABEL: loadnxv4f32_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur s0, [x0, #1]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load float, ptr %g
+  %v = insertelement <vscale x 4 x float> poison, float %l, i32 0
+  ret <vscale x 4 x float> %v
+}
+
+define <vscale x 2 x double> @loadnxv2f64_offset(ptr %p) {
+; CHECK-LABEL: loadnxv2f64_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i64 1
+  %l = load double, ptr %g
+  %v = insertelement <vscale x 2 x double> poison, double %l, i32 0
+  ret <vscale x 2 x double> %v
+}
diff --git a/llvm/test/CodeGen/AArch64/load-insert-zero.ll b/llvm/test/CodeGen/AArch64/load-insert-zero.ll
index 23d545459295f..ccbd6f03fbcc3 100644
--- a/llvm/test/CodeGen/AArch64/load-insert-zero.ll
+++ b/llvm/test/CodeGen/AArch64/load-insert-zero.ll
@@ -378,7 +378,6 @@ define <2 x i64> @loadv2i64_noffset(ptr %p) {
   ret <2 x i64> %v
 }

-
 define <4 x half> @loadv4f16_noffset(ptr %p) {
 ; CHECK-LABEL: loadv4f16_noffset:
 ; CHECK:       // %bb.0:
@@ -457,6 +456,328 @@ define <2 x double> @loadv2f64_noffset(ptr %p) {
 }


+; ROW addressing modes
+
+define <8 x i8> @loadv8i8_roW(ptr %p, i32 %o) {
+; CHECK-LABEL: loadv8i8_roW:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0, w1, sxtw]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i32 %o
+  %l = load i8, ptr %g
+  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
+  ret <8 x i8> %v
+}
+
+define <16 x i8> @loadv16i8_roW(ptr %p, i32 %o) {
+; CHECK-LABEL: loadv16i8_roW:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0, w1, sxtw]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i8, ptr %p, i32 %o
+  %l = load i8, ptr %g
+  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
+  ret <16 x i8> %v
+}
+
+define <4 x i16> @loadv4i16_roW(ptr %p, i32 %o) {
+; CHECK-LABEL: loadv4i16_roW:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0, w1, sxtw #1]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i16, ptr %p, i32 %o
+  %l = load i16, ptr %g
+  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
+  ret <4 x i16> %v
+}
+
+define <8 x i16> @loadv8i16_roW(ptr %p, i32 %o) {
+; CHECK-LABEL: loadv8i16_roW:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0, w1, sxtw #1]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i16, ptr %p, i32 %o
+  %l = load i16, ptr %g
+  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
+  ret <8 x i16> %v
+}
+
+define <2 x i32> @loadv2i32_roW(ptr %p, i32 %o) {
+; CHECK-LABEL: loadv2i32_roW:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, w1, sxtw #2
+; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i32, ptr %p, i32 %o
+  %l = load i32, ptr %g
+  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
+  ret <2 x i32> %v
+}
+
+define <4 x i32> @loadv4i32_roW(ptr %p, i32 %o) {
+; CHECK-LABEL: loadv4i32_roW:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, w1, sxtw #2
+; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i32, ptr %p, i32 %o
+  %l = load i32, ptr %g
+  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
+  ret <4 x i32> %v
+}
+
+define <2 x i64> @loadv2i64_roW(ptr %p, i32 %o) {
+; CHECK-LABEL: loadv2i64_roW:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0, w1, sxtw #3]
+; CHECK-NEXT:    ret
+  %g = getelementptr inbounds i64, ptr %p, i32 %o
+  %l = load i64, ptr %g
+  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
+  ret <2 x i64> %v
+}
+
+define <4 x half> @loadv4f16_roW(ptr %p, i32 %o) {
+; 
CHECK-LABEL: loadv4f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <4 x half> zeroinitializer, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <8 x half> zeroinitializer, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, sxtw #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <2 x float> zeroinitializer, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, sxtw #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <4 x float> zeroinitializer, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i32 %o + %l = load double, ptr %g + %v = insertelement <2 x double> zeroinitializer, double %l, i32 0 + ret <2 x double> %v +} + + +; roX + +define <8 x i8> @loadv8i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv16i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <8 x 
i16> zeroinitializer, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i64 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, ptr %g + %v = insertelement <4 x half> zeroinitializer, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, ptr %g + %v = insertelement <8 x half> zeroinitializer, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <2 x float> zeroinitializer, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <4 x float> zeroinitializer, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i64 %o + %l = load double, ptr %g + %v = insertelement <2 x double> zeroinitializer, double %l, i32 0 + ret <2 x double> %v +} + + define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef %1, 
ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) { ; CHECK-LABEL: predictor_4x4_neon: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir index e4f11dfa9e027..d6135d86022be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir @@ -193,10 +193,10 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 255 @@ -216,10 +216,10 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 255 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir index 3b914df7f8f8a..3423af64162e5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir @@ -12,9 +12,11 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %c3FFF:_(s32) = G_CONSTANT i32 16383 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %c3FFF - ; GCN-NEXT: $vgpr0 = COPY %low_bits(s32) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]] + ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s32) = COPY $vgpr0 %c3FFF:_(s32) = G_CONSTANT i32 16383 %low_bits:_(s32) = G_AND %var, %c3FFF @@ -34,10 +36,8 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %cFFFFF:_(s32) = G_CONSTANT i32 1048575 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %cFFFFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s32) - ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: %zext:_(s32) = G_ZEXT [[TRUNC]](s16) ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s32) = COPY $vgpr0 %cFFFFF:_(s32) = G_CONSTANT i32 1048575 @@ -58,9 +58,9 @@ body: | ; GCN: liveins: $vgpr0_vgpr1 ; 
GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1 - ; GCN-NEXT: %c3FFF:_(s64) = G_CONSTANT i64 16383 - ; GCN-NEXT: %low_bits:_(s64) = G_AND %var, %c3FFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s64) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s64) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]] ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s64) = COPY $vgpr0_vgpr1 @@ -82,9 +82,9 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %c3FFF:_(s32) = G_CONSTANT i32 16383 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %c3FFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s32) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]] ; GCN-NEXT: %zext:_(s64) = G_ZEXT %trunc(s16) ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(s64) %var:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index 966a481b6594d..bb7bc0447aea0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -238,13 +238,12 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0 ; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_xor_b32_e32 v3, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: ; implicit-def: $vgpr3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index afffebea451a0..3bd3486ec261d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -350,10 +350,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX8-LABEL: s_fshl_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s3, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -362,10 +364,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX9-LABEL: s_fshl_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_and_b32 s3, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: 
s_or_b32 s0, s0, s1 @@ -377,7 +381,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX10-NEXT: s_and_b32 s3, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -389,7 +395,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX11-NEXT: s_and_b32 s3, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -416,11 +424,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -429,11 +437,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -441,11 +449,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX10-LABEL: v_fshl_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v3, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -454,12 +462,12 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX11-LABEL: v_fshl_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v3, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 @@ -692,22 +700,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; 
GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s4, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s3, 7, s5 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s2, s2, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff @@ -719,22 +731,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s4, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, 7, s5 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff @@ -745,21 +761,25 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-LABEL: s_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-NEXT: s_and_b32 s5, s2, 7 +; GFX10-NEXT: s_lshr_b32 s6, s2, 8 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_and_b32 s6, s2, 7 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_and_b32 s5, s6, 7 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_andn2_b32 s6, 7, s6 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s5, 7 -; GFX10-NEXT: s_andn2_b32 s5, 7, s5 -; GFX10-NEXT: s_lshr_b32 s4, s4, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, s5 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, 
s0, s1 @@ -772,21 +792,25 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX11-LABEL: s_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: s_and_b32 s5, s2, 7 +; GFX11-NEXT: s_lshr_b32 s6, s2, 8 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s6, s2, 7 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, s5 +; GFX11-NEXT: s_and_b32 s5, s6, 7 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_and_not1_b32 s6, 7, s6 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s6, s5, 7 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s3, s3, s5 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -837,20 +861,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v2, v5 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -863,20 +887,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v2, v5 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX9-NEXT: v_lshrrev_b16_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -886,24 +910,24 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-LABEL: v_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX10-NEXT: v_not_b32_e32 v7, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16 v3, v6, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -913,26 +937,26 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX11-LABEL: v_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX11-NEXT: v_not_b32_e32 v7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX11-NEXT: v_lshrrev_b16 v4, 1, v4 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX11-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX11-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-NEXT: v_lshlrev_b16 v4, v4, v5 ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b16 v4, v6, v4 +; GFX11-NEXT: v_lshrrev_b16 v3, v6, v3 ; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 ; 
GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 @@ -1002,13 +1026,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 ; GFX8-NEXT: s_and_b32 s12, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 @@ -1016,29 +1042,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, s6, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s3, 7, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s2, s2, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 7 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s3, s7, 0xff ; GFX8-NEXT: s_lshl_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s7, 0xff +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_andn2_b32 s4, 7, s10 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_andn2_b32 s3, 7, s10 -; GFX8-NEXT: s_lshr_b32 s4, s4, 1 -; GFX8-NEXT: s_lshr_b32 s3, s4, s3 +; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_andn2_b32 s4, 7, s11 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s3, s5, s3 -; GFX8-NEXT: s_lshr_b32 s5, s8, 1 +; GFX8-NEXT: s_andn2_b32 s5, 7, s11 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s4, s8, 1 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshr_b32 s4, s5, s4 +; GFX8-NEXT: s_lshr_b32 s4, s4, s5 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_or_b32 s3, s3, s4 @@ -1055,13 +1087,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 ; GFX9-NEXT: s_and_b32 s12, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 @@ -1069,29 +1103,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 +; GFX9-NEXT: s_and_b32 s1, 
0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s6, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, 7, s9 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s10, 7 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_and_b32 s3, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s4, s7, 0xff +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_andn2_b32 s4, 7, s10 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX9-NEXT: s_andn2_b32 s3, 7, s10 -; GFX9-NEXT: s_lshr_b32 s4, s4, 1 -; GFX9-NEXT: s_lshr_b32 s3, s4, s3 +; GFX9-NEXT: s_lshr_b32 s3, s3, s4 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_andn2_b32 s4, 7, s11 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s3, s5, s3 -; GFX9-NEXT: s_lshr_b32 s5, s8, 1 +; GFX9-NEXT: s_andn2_b32 s5, 7, s11 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s4, s8, 1 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s4, s5, s4 +; GFX9-NEXT: s_lshr_b32 s4, s4, s5 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: s_or_b32 s3, s3, s4 @@ -1108,48 +1148,56 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_and_b32 s11, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_andn2_b32 s12, 7, s2 +; GFX10-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, 0xff -; GFX10-NEXT: s_and_b32 s6, s9, 7 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s11 +; GFX10-NEXT: s_lshr_b32 s1, s1, s12 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, 0xff -; GFX10-NEXT: s_and_b32 s3, s10, 7 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_andn2_b32 s6, 7, s10 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_and_b32 s1, s9, 7 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshr_b32 s6, s6, 1 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX10-NEXT: s_lshl_b32 s1, s3, s1 +; GFX10-NEXT: s_lshr_b32 s3, s6, s9 +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s6 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: 
s_lshr_b32 s2, s2, 24 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s11, 7 -; GFX10-NEXT: s_andn2_b32 s6, 7, s11 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX10-NEXT: s_andn2_b32 s6, 7, s10 +; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_and_b32 s7, s2, 7 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX10-NEXT: s_lshr_b32 s7, s8, 1 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 -; GFX10-NEXT: s_lshr_b32 s5, s7, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, s6 +; GFX10-NEXT: s_lshr_b32 s2, s7, s2 +; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_or_b32 s2, s5, s2 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_and_b32 s3, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s3, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1161,48 +1209,56 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_and_b32 s11, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 +; GFX11-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_and_b32 s2, s6, 0xff -; GFX11-NEXT: s_and_b32 s6, s9, 7 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 -; GFX11-NEXT: s_lshl_b32 s0, s0, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s2, s2, s9 +; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, s11 +; GFX11-NEXT: s_lshr_b32 s1, s1, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_and_b32 s3, s10, 7 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_and_b32 s1, s9, 7 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s6, s6, 1 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s1, s3, s1 +; GFX11-NEXT: s_lshr_b32 s3, s6, s9 +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s6 +; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_lshr_b32 s2, s2, 24 ; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s2, s6 -; GFX11-NEXT: s_and_b32 s4, s11, 7 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s11 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX11-NEXT: s_and_not1_b32 s6, 
7, s10 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_and_b32 s7, s2, 7 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX11-NEXT: s_lshr_b32 s7, s8, 1 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_lshr_b32 s5, s7, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s5, s5, s6 +; GFX11-NEXT: s_lshr_b32 s2, s7, s2 +; GFX11-NEXT: s_or_b32 s3, s3, s4 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_or_b32 s2, s5, s2 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s2, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s3, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_lshl_b32 s1, s2, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1271,37 +1327,38 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshl_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v7, v2 -; GFX8-NEXT: v_mov_b32_e32 v9, 1 +; GFX8-NEXT: v_mov_b32_e32 v8, 1 +; GFX8-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX8-NEXT: v_lshrrev_b16_e32 v9, v10, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 7, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v9, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, 0xff +; GFX8-NEXT: v_mov_b32_e32 v7, 0xff ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_mov_b32_e32 v9, -1 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; 
GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8 +; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 @@ -1320,46 +1377,47 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshl_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_not_b32_e32 v7, v2 -; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v8, 1 +; GFX9-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v0 -; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_lshrrev_b16_e32 v9, v10, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v5, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 7, v5 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v9, v3 ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 0xff ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 7 +; GFX9-NEXT: v_mov_b32_e32 v10, -1 ; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_xor_b32_sdwa v11, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshrrev_b16_e32 v10, 1, v10 +; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v9, 1, v9 +; GFX9-NEXT: 
v_and_b32_e32 v11, 7, v11 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_lshrrev_b16_e32 v9, v11, v9 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v9 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1 +; GFX9-NEXT: v_and_or_b32 v1, v6, v7, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1368,41 +1426,42 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-LABEL: v_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 0xff, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v1 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_not_b32_e32 v12, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_lshrrev_b16 v9, 1, v11 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v12 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX10-NEXT: v_lshlrev_b16 v3, v7, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, 7 -; GFX10-NEXT: v_not_b32_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_not_b32_sdwa v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_not_b32_e32 v8, v2 -; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_sdwa v14, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 +; GFX10-NEXT: v_lshrrev_b16 v8, 1, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v10 +; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, 0xff +; GFX10-NEXT: v_mov_b32_e32 v11, -1 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_mov_b32_e32 v13, 7 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v10, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v11, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 
+; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_sdwa v14, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v7, 7, v12 -; GFX10-NEXT: v_lshrrev_b16 v10, 1, v10 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 ; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v13, v1 +; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 ; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v7, v10 -; GFX10-NEXT: v_lshrrev_b16 v7, v8, v9 +; GFX10-NEXT: v_lshrrev_b16 v5, v11, v12 +; GFX10-NEXT: v_lshrrev_b16 v7, v9, v8 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 @@ -1426,7 +1485,7 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_not_b32_e32 v13, v9 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 @@ -1434,22 +1493,22 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3 -; GFX11-NEXT: v_not_b32_e32 v9, v10 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 -; GFX11-NEXT: v_not_b32_e32 v13, v11 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_and_b32_e32 v12, 7, v2 -; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-NEXT: v_lshlrev_b16 v4, v10, v4 ; GFX11-NEXT: v_lshrrev_b16 v6, v9, v7 @@ -5087,23 +5146,48 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) } define amdgpu_ps i64 @s_fshl_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) { -; GCN-LABEL: s_fshl_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GCN-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_fshl_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX6-NEXT: s_not_b32 s4, s4 +; GFX6-NEXT: s_lshr_b64 s[2:3], 
s[2:3], s4 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX8-NEXT: s_not_b32 s4, s4 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_not_b32 s4, s4 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_not_b32 s5, s4 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[4:5] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_not_b32 s5, s4 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: ; return to shader part epilog @@ -5181,8 +5265,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 @@ -5194,8 +5278,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5207,8 +5291,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5362,36 +5446,36 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v1 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX6-NEXT: v_lshr_b64 v[2:3], s[0:1], v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_lshr_b64 v[3:4], s[0:1], v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i64_ssv: ; GFX8: 
; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] +; GFX8-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i64_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] +; GFX9-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i64_ssv: @@ -5429,10 +5513,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX6-LABEL: v_fshl_i64_svs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_andn2_b32 s3, 63, s2 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s3 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -5440,10 +5523,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX8-LABEL: v_fshl_i64_svs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_andn2_b32 s3, 63, s2 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -5451,10 +5533,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX9-LABEL: v_fshl_i64_svs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_andn2_b32 s3, 63, s2 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -5462,10 +5543,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX10-LABEL: v_fshl_i64_svs: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3] -; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63 +; GFX10-NEXT: s_andn2_b32 s3, 63, s2 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[0:1], 
s3, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -5473,13 +5553,12 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX11-LABEL: v_fshl_i64_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3] -; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_not1_b32 s3, 63, s2 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5490,10 +5569,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg %amt) { ; GFX6-LABEL: v_fshl_i64_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s4 +; GFX6-NEXT: s_and_b32 s3, s2, 63 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s3 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX6-NEXT: s_not_b32 s2, s2 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5501,10 +5580,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX8-LABEL: v_fshl_i64_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX8-NEXT: s_and_b32 s3, s2, 63 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX8-NEXT: s_not_b32 s2, s2 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5512,10 +5591,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX9-LABEL: v_fshl_i64_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX9-NEXT: s_and_b32 s3, s2, 63 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX9-NEXT: s_not_b32 s2, s2 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5523,10 +5602,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX10-LABEL: v_fshl_i64_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: s_and_b32 s3, s2, 63 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5534,10 +5613,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; 
GFX11-LABEL: v_fshl_i64_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_and_b32 s3, s2, 63 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX11-NEXT: s_not_b32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -5553,80 +5632,70 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX6-NEXT: s_not_b32 s8, s8 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: s_not_b32 s6, s10 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX8-NEXT: s_not_b32 s8, s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX8-NEXT: s_not_b32 s6, s10 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX9-NEXT: s_not_b32 s8, s8 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX9-NEXT: s_not_b32 s6, s10 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX10-NEXT: s_not_b32 s9, s8 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX10-NEXT: s_and_b64 
s[8:9], s[10:11], 63 -; GFX10-NEXT: s_andn2_b64 s[10:11], 63, s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX10-NEXT: s_not_b32 s8, s10 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s8 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[8:9] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX11-NEXT: s_not_b32 s9, s8 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX11-NEXT: s_and_b64 s[8:9], s[10:11], 63 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 63, s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX11-NEXT: s_not_b32 s8, s10 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s8 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX11-NEXT: ; return to shader part epilog @@ -5639,18 +5708,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1 +; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 -; GFX6-NEXT: v_not_b32_e32 v8, v10 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v8 +; GFX6-NEXT: v_not_b32_e32 v4, v10 +; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5660,18 +5729,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX8-NEXT: v_not_b32_e32 v8, v10 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7] +; GFX8-NEXT: v_not_b32_e32 v4, v10 +; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5681,18 +5750,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v9, 
63, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v8, v10 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7] +; GFX9-NEXT: v_not_b32_e32 v4, v10 +; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5750,231 +5819,236 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: s_fshl_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX6-NEXT: s_sub_i32 s9, s10, 64 -; GFX6-NEXT: s_sub_i32 s11, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_and_b32 s9, s8, 0x7f +; GFX6-NEXT: s_sub_i32 s11, s9, 64 +; GFX6-NEXT: s_sub_i32 s14, 64, s9 +; GFX6-NEXT: s_cmp_lt_u32 s9, 64 ; GFX6-NEXT: s_cselect_b32 s18, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX6-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s9, 0 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_mov_b32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s9, 0 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX6-NEXT: s_lshl_b32 s13, s6, 31 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX6-NEXT: s_lshl_b32 s11, s6, 31 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: s_sub_i32 s12, s8, 64 -; GFX6-NEXT: s_sub_i32 s10, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX6-NEXT: s_not_b32 s9, s8 +; GFX6-NEXT: s_sub_i32 s14, s6, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s6 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_cselect_b32 s15, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX6-NEXT: 
s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX6-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX8-NEXT: s_sub_i32 s9, s10, 64 -; GFX8-NEXT: s_sub_i32 s11, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_and_b32 s9, s8, 0x7f +; GFX8-NEXT: s_sub_i32 s11, s9, 64 +; GFX8-NEXT: s_sub_i32 s14, 64, s9 +; GFX8-NEXT: s_cmp_lt_u32 s9, 64 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX8-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s9, 0 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX8-NEXT: s_cmp_lg_u32 s18, 0 -; GFX8-NEXT: s_mov_b32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s9, 0 +; GFX8-NEXT: s_mov_b32 s10, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX8-NEXT: s_lshl_b32 s13, s6, 31 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX8-NEXT: s_lshl_b32 s11, s6, 31 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX8-NEXT: s_sub_i32 s12, s8, 64 -; GFX8-NEXT: s_sub_i32 s10, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX8-NEXT: s_not_b32 s9, s8 +; GFX8-NEXT: s_sub_i32 s14, s6, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_cselect_b32 s15, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX8-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 
s[10:11], s[8:9], 0x7f -; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX9-NEXT: s_sub_i32 s9, s10, 64 -; GFX9-NEXT: s_sub_i32 s11, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_and_b32 s9, s8, 0x7f +; GFX9-NEXT: s_sub_i32 s11, s9, 64 +; GFX9-NEXT: s_sub_i32 s14, 64, s9 +; GFX9-NEXT: s_cmp_lt_u32 s9, 64 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX9-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s9, 0 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_mov_b32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: s_mov_b32 s10, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX9-NEXT: s_lshl_b32 s13, s6, 31 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX9-NEXT: s_lshl_b32 s11, s6, 31 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_sub_i32 s12, s8, 64 -; GFX9-NEXT: s_sub_i32 s10, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX9-NEXT: s_not_b32 s9, s8 +; GFX9-NEXT: s_sub_i32 s14, s6, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_cselect_b32 s15, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX9-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX10-NEXT: s_sub_i32 s9, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_and_b32 s9, s8, 0x7f +; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: s_sub_i32 s11, s9, 64 +; GFX10-NEXT: s_sub_i32 s12, 64, s9 +; GFX10-NEXT: s_cmp_lt_u32 s9, 64 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; 
GFX10-NEXT: s_lshr_b64 s[14:15], s[0:1], s11 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s9, 0 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 +; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 +; GFX10-NEXT: s_cselect_b64 s[14:15], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX10-NEXT: s_lshl_b32 s13, s6, 31 +; GFX10-NEXT: s_lshl_b32 s11, s6, 31 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] -; GFX10-NEXT: s_sub_i32 s14, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_not_b32 s10, s8 +; GFX10-NEXT: s_sub_i32 s12, s6, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX10-NEXT: s_lshl_b64 s[12:13], s[4:5], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s10 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9] -; GFX11-NEXT: s_sub_i32 s9, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 -; GFX11-NEXT: s_mov_b32 s12, 0 -; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_and_b32 s9, s8, 0x7f +; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: s_sub_i32 s11, s9, 64 +; GFX11-NEXT: s_sub_i32 s12, 64, s9 +; GFX11-NEXT: s_cmp_lt_u32 s9, 64 ; GFX11-NEXT: s_cselect_b32 s18, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[0:1], s11 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: 
s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s9, 0 +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 +; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_cselect_b64 s[14:15], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX11-NEXT: s_lshl_b32 s13, s6, 31 +; GFX11-NEXT: s_lshl_b32 s11, s6, 31 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] -; GFX11-NEXT: s_sub_i32 s14, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s15, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_and_not1_b32 s6, 0x7f, s8 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX11-NEXT: s_not_b32 s10, s8 +; GFX11-NEXT: s_sub_i32 s12, s6, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_cselect_b32 s13, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX11-NEXT: s_lshl_b64 s[12:13], s[4:5], s9 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s10 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX11-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -5985,143 +6059,143 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v14 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v14 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v14 -; GFX6-NEXT: v_lshl_b64 v[12:13], v[0:1], v14 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15 +; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 +; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9 +; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 +; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v16 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX6-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX6-NEXT: v_cndmask_b32_e32 
v11, 0, v13, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX6-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX6-NEXT: v_not_b32_e32 v4, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v15 -; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v15 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v15 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14 +; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v14 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v15 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v14 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v15 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v14 -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v14 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15 +; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 +; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX8-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc 
; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX8-NEXT: v_not_b32_e32 v4, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v15 -; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v15 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v15, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, 64, v14 -; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v14 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15 +; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 +; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX9-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v10, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v3, vcc +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v9, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1 -; GFX9-NEXT: v_sub_u32_e32 v6, 64, v15 -; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v15 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, 
v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14 +; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v15, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX9-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6129,15 +6203,15 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX10-NEXT: v_not_b32_e32 v8, v8 +; GFX10-NEXT: v_not_b32_e32 v10, v8 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v18 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5 ; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] @@ -6175,43 +6249,43 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX11-LABEL: v_fshl_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX11-NEXT: v_not_b32_e32 v8, v8 +; GFX11-NEXT: v_not_b32_e32 v10, v8 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 ; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 
v[0:1], v20, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13] -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX11-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_cndmask_b32 v7, 0, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v15, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v5, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s0 @@ -6229,173 +6303,173 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v6 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0 -; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v6 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v6 -; GFX6-NEXT: v_lshl_b64 v[4:5], s[0:1], v6 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1 +; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 +; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v8 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v8 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 
s9, s6, 31 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v10 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v7 +; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v6 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v6 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1] -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s9, s6, 31 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc 
; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v7 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v6 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v6 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1] -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_lshl_b32 s9, s6, 31 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 
v10, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v7 +; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX9-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_not_b32_e32 v2, v0 ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX10-NEXT: s_lshl_b32 s9, s6, 31 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v12 +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] ; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] @@ -6434,58 +6508,52 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX11-LABEL: v_fshl_i128_ssv: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX11-NEXT: v_not_b32_e32 v0, v0 +; GFX11-NEXT: v_not_b32_e32 v2, v0 ; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX11-NEXT: s_lshl_b32 s9, s6, 31 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v2 ; GFX11-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] -; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v12 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13 +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] +; GFX11-NEXT: 
v_lshrrev_b64 v[2:3], v3, s[0:1] +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v1 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s9, s1 -; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 ; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6495,43 +6563,43 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_sub_i32 s5, s6, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s12, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 +; 
GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: s_sub_i32 s0, s4, 64 -; GFX6-NEXT: s_sub_i32 s1, 64, s4 +; GFX6-NEXT: s_sub_i32 s1, s0, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s5, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s4 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 -; GFX6-NEXT: s_cselect_b32 s6, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1 ; GFX6-NEXT: s_and_b32 s0, 1, s5 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s6 +; GFX6-NEXT: s_and_b32 s0, 1, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6539,51 +6607,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX6-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX6-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_sub_i32 s5, s6, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s12, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, 
v[0:1] +; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: s_sub_i32 s0, s4, 64 -; GFX8-NEXT: s_sub_i32 s1, 64, s4 +; GFX8-NEXT: s_sub_i32 s1, s0, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] ; GFX8-NEXT: s_and_b32 s0, 1, s5 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s6 +; GFX8-NEXT: s_and_b32 s0, 1, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6591,50 +6659,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX8-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX8-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_sub_i32 s5, s6, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s12, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX9-NEXT: v_lshl_or_b32 v1, 
v2, 31, v1 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: s_sub_i32 s0, s4, 64 -; GFX9-NEXT: s_sub_i32 s1, 64, s4 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: s_sub_i32 s1, s0, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] ; GFX9-NEXT: s_and_b32 s0, 1, s5 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s6 +; GFX9-NEXT: s_and_b32 s0, 1, s8 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6642,50 +6710,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX9-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_sub_i32 s12, s5, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s7 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 -; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 31, v1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s4 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s0, 64, s4 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX10-NEXT: s_sub_i32 s0, s4, 64 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX10-NEXT: s_cselect_b32 s1, 1, 
0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_andn2_b32 s0, 0x7f, s4 +; GFX10-NEXT: s_sub_i32 s1, 64, s0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX10-NEXT: s_sub_i32 s1, s0, 64 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s1 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] ; GFX10-NEXT: s_and_b32 s0, 1, s5 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo @@ -6695,62 +6763,62 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX10-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i128_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_sub_i32 s12, s5, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s7 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 -; GFX11-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 -; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 31, v1 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s4 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s5, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s0, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s0, 0x7f, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX11-NEXT: s_sub_i32 s0, s4, 64 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_sub_i32 s1, 64, s0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX11-NEXT: s_sub_i32 s1, s0, 64 +; 
GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-NEXT: s_and_b32 s0, 1, s1 +; GFX11-NEXT: s_and_b32 s1, 1, s4 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] ; GFX11-NEXT: s_and_b32 s0, 1, s5 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX11-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6760,25 +6828,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_sub_i32 s5, s6, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s7, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 -; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s10, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s7 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s6 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5 -; GFX6-NEXT: s_and_b32 s5, 1, s9 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s8 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s5 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s7 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b32 s9, s2, 31 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s6 +; GFX6-NEXT: s_lshl_b32 s7, s2, 31 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5 +; GFX6-NEXT: s_and_b32 s5, 1, s9 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: s_not_b32 s6, s4 +; GFX6-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX6-NEXT: s_and_b32 s5, 1, s10 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_sub_i32 s10, s4, 64 ; GFX6-NEXT: s_sub_i32 s8, 64, 
s4 @@ -6793,19 +6862,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6814,25 +6883,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX8-LABEL: v_fshl_i128_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_sub_i32 s5, s6, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s7, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 -; GFX8-NEXT: s_mov_b32 s8, 0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 +; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX8-NEXT: s_and_b32 s5, 1, s9 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b32 s9, s2, 31 -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX8-NEXT: s_lshl_b32 s7, s2, 31 +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] +; GFX8-NEXT: s_and_b32 s5, 1, s9 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_not_b32 s6, s4 +; GFX8-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_and_b32 s5, 1, s10 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_sub_i32 s10, s4, 64 ; GFX8-NEXT: s_sub_i32 s8, 64, s4 @@ -6847,19 +6917,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: 
s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6868,25 +6938,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX9-LABEL: v_fshl_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_sub_i32 s5, s6, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s7, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 -; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX9-NEXT: s_and_b32 s5, 1, s9 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b32 s9, s2, 31 -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX9-NEXT: s_lshl_b32 s7, s2, 31 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] +; GFX9-NEXT: s_and_b32 s5, 1, s9 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_not_b32 s6, s4 +; GFX9-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_and_b32 s5, 1, s10 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_sub_i32 s10, s4, 64 ; GFX9-NEXT: s_sub_i32 s8, 64, s4 @@ -6901,19 +6972,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6922,53 +6993,54 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX10-LABEL: v_fshl_i128_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f +; GFX10-NEXT: s_sub_i32 s6, s5, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; 
GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX10-NEXT: s_and_b32 s6, 1, s8 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_lshl_b32 s7, s2, 31 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX10-NEXT: s_and_b32 s5, 1, s9 +; GFX10-NEXT: s_and_b32 s5, 1, s8 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_sub_i32 s10, s4, 64 -; GFX10-NEXT: s_sub_i32 s8, 64, s4 +; GFX10-NEXT: s_andn2_b32 s6, 0x7f, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_and_b32 s5, 1, s9 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_not_b32 s8, s4 +; GFX10-NEXT: s_sub_i32 s10, s6, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s7 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo -; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -6976,50 +7048,52 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX11-LABEL: v_fshl_i128_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s6, s5, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX11-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX11-NEXT: s_cmp_eq_u32 
s5, 0 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX11-NEXT: s_and_b32 s6, 1, s8 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] ; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_lshl_b32 s7, s2, 31 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX11-NEXT: s_and_b32 s5, 1, s9 +; GFX11-NEXT: s_and_b32 s5, 1, s8 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_sub_i32 s10, s4, 64 -; GFX11-NEXT: s_sub_i32 s8, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s6, 0x7f, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_and_b32 s5, 1, s9 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_not_b32 s8, s4 +; GFX11-NEXT: s_sub_i32 s10, s6, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 ; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 ; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 +; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], s7 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7152,40 +7226,41 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX6-NEXT: s_sub_i32 s17, s18, 64 -; GFX6-NEXT: s_sub_i32 s19, 64, s18 -; GFX6-NEXT: s_cmp_lt_u32 s18, 64 -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, 0 +; GFX6-NEXT: s_and_b32 s17, s16, 0x7f +; GFX6-NEXT: s_sub_i32 s19, s17, 64 +; GFX6-NEXT: s_sub_i32 s21, 64, s17 +; GFX6-NEXT: s_cmp_lt_u32 s17, 64 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX6-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX6-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX6-NEXT: s_cmp_lg_u32 s23, 0 -; GFX6-NEXT: s_cselect_b64 s[24:25], 
s[24:25], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s17, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_mov_b32 s22, 0 +; GFX6-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_mov_b32 s18, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX6-NEXT: s_lshl_b32 s23, s10, 31 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX6-NEXT: s_lshl_b32 s19, s10, 31 ; GFX6-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX6-NEXT: s_sub_i32 s23, s16, 64 -; GFX6-NEXT: s_sub_i32 s18, 64, s16 -; GFX6-NEXT: s_cmp_lt_u32 s16, 64 +; GFX6-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX6-NEXT: s_not_b32 s17, s16 +; GFX6-NEXT: s_sub_i32 s19, s10, 64 +; GFX6-NEXT: s_sub_i32 s21, 64, s10 +; GFX6-NEXT: s_cmp_lt_u32 s10, 64 ; GFX6-NEXT: s_cselect_b32 s26, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 ; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 @@ -7193,86 +7268,88 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX6-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX6-NEXT: s_sub_i32 s11, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 +; GFX6-NEXT: s_and_b32 s8, s20, 0x7f +; GFX6-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX6-NEXT: s_sub_i32 s19, s8, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s21, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s22, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX6-NEXT: s_cmp_lg_u32 s22, 0 ; GFX6-NEXT: s_cselect_b64 
s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX6-NEXT: s_lshl_b32 s23, s14, 31 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX6-NEXT: s_sub_i32 s18, s10, 64 -; GFX6-NEXT: s_sub_i32 s14, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_lshl_b32 s19, s14, 31 +; GFX6-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX6-NEXT: s_not_b32 s14, s20 +; GFX6-NEXT: s_sub_i32 s18, s12, 64 +; GFX6-NEXT: s_sub_i32 s16, 64, s12 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX6-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX6-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX8-NEXT: s_sub_i32 s17, s18, 64 -; GFX8-NEXT: s_sub_i32 s19, 64, s18 -; GFX8-NEXT: s_cmp_lt_u32 s18, 64 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s18, 0 +; GFX8-NEXT: s_and_b32 s17, s16, 0x7f +; GFX8-NEXT: s_sub_i32 s19, s17, 64 +; GFX8-NEXT: s_sub_i32 s21, 64, s17 +; GFX8-NEXT: s_cmp_lt_u32 s17, 64 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX8-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX8-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s17, 0 +; GFX8-NEXT: s_cselect_b32 s17, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_mov_b32 s22, 0 +; GFX8-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_mov_b32 s18, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX8-NEXT: s_lshl_b32 s23, s10, 31 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX8-NEXT: s_lshl_b32 s19, s10, 31 ; GFX8-NEXT: 
s_lshr_b64 s[8:9], s[10:11], 1 -; GFX8-NEXT: s_sub_i32 s23, s16, 64 -; GFX8-NEXT: s_sub_i32 s18, 64, s16 -; GFX8-NEXT: s_cmp_lt_u32 s16, 64 +; GFX8-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX8-NEXT: s_not_b32 s17, s16 +; GFX8-NEXT: s_sub_i32 s19, s10, 64 +; GFX8-NEXT: s_sub_i32 s21, 64, s10 +; GFX8-NEXT: s_cmp_lt_u32 s10, 64 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 @@ -7280,86 +7357,88 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX8-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX8-NEXT: s_sub_i32 s11, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 +; GFX8-NEXT: s_and_b32 s8, s20, 0x7f +; GFX8-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX8-NEXT: s_sub_i32 s19, s8, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s22, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX8-NEXT: s_cmp_lg_u32 s22, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX8-NEXT: s_lshl_b32 s23, s14, 31 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX8-NEXT: s_sub_i32 s18, s10, 64 -; GFX8-NEXT: s_sub_i32 s14, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_lshl_b32 s19, s14, 31 +; GFX8-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX8-NEXT: s_not_b32 s14, s20 +; GFX8-NEXT: s_sub_i32 s18, s12, 64 +; GFX8-NEXT: s_sub_i32 s16, 64, s12 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX8-NEXT: s_lshr_b64 
s[10:11], s[4:5], s10 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX8-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX8-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX9-NEXT: s_sub_i32 s17, s18, 64 -; GFX9-NEXT: s_sub_i32 s19, 64, s18 -; GFX9-NEXT: s_cmp_lt_u32 s18, 64 -; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s18, 0 +; GFX9-NEXT: s_and_b32 s17, s16, 0x7f +; GFX9-NEXT: s_sub_i32 s19, s17, 64 +; GFX9-NEXT: s_sub_i32 s21, 64, s17 +; GFX9-NEXT: s_cmp_lt_u32 s17, 64 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX9-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX9-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 -; GFX9-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s17, 0 +; GFX9-NEXT: s_cselect_b32 s17, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 -; GFX9-NEXT: s_mov_b32 s22, 0 +; GFX9-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_mov_b32 s18, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX9-NEXT: s_lshl_b32 s23, s10, 31 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX9-NEXT: s_lshl_b32 s19, s10, 31 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX9-NEXT: s_sub_i32 s23, s16, 64 -; GFX9-NEXT: s_sub_i32 s18, 64, s16 -; GFX9-NEXT: s_cmp_lt_u32 s16, 64 +; GFX9-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX9-NEXT: s_not_b32 s17, s16 +; GFX9-NEXT: s_sub_i32 s19, s10, 64 +; GFX9-NEXT: s_sub_i32 s21, 64, s10 +; GFX9-NEXT: s_cmp_lt_u32 s10, 64 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX9-NEXT: s_lshr_b64 s[16:17], 
s[0:1], s17 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 @@ -7367,222 +7446,227 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX9-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX9-NEXT: s_sub_i32 s11, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 +; GFX9-NEXT: s_and_b32 s8, s20, 0x7f +; GFX9-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX9-NEXT: s_sub_i32 s19, s8, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX9-NEXT: s_lshl_b32 s23, s14, 31 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX9-NEXT: s_sub_i32 s18, s10, 64 -; GFX9-NEXT: s_sub_i32 s14, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 +; GFX9-NEXT: s_lshl_b32 s19, s14, 31 +; GFX9-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX9-NEXT: s_not_b32 s14, s20 +; GFX9-NEXT: s_sub_i32 s18, s12, 64 +; GFX9-NEXT: s_sub_i32 s16, 64, s12 +; GFX9-NEXT: s_cmp_lt_u32 s12, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX9-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; 
GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX9-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX10-NEXT: s_sub_i32 s17, s18, 64 -; GFX10-NEXT: s_sub_i32 s19, 64, s18 -; GFX10-NEXT: s_cmp_lt_u32 s18, 64 -; GFX10-NEXT: s_mov_b32 s22, 0 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s18, 0 +; GFX10-NEXT: s_and_b32 s17, s16, 0x7f +; GFX10-NEXT: s_mov_b32 s18, 0 +; GFX10-NEXT: s_sub_i32 s19, s17, 64 +; GFX10-NEXT: s_sub_i32 s21, 64, s17 +; GFX10-NEXT: s_cmp_lt_u32 s17, 64 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s19 -; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s18 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[0:1], s18 -; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX10-NEXT: s_cmp_lg_u32 s23, 0 -; GFX10-NEXT: s_cselect_b64 s[18:19], s[18:19], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s17, 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s21 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s16 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s16 +; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX10-NEXT: s_lshl_b32 s23, s10, 31 +; GFX10-NEXT: s_lshl_b32 s19, s10, 31 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] -; GFX10-NEXT: s_sub_i32 s23, s16, 64 -; GFX10-NEXT: s_sub_i32 s17, 64, s16 -; GFX10-NEXT: s_cmp_lt_u32 s16, 64 +; GFX10-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX10-NEXT: s_not_b32 s19, s16 +; GFX10-NEXT: s_sub_i32 s21, s10, 64 +; GFX10-NEXT: s_sub_i32 s16, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s16, 0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 ; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s19 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[8:9], s16 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[8:9], s19 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s21 ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 -; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX10-NEXT: s_and_b32 s10, s20, 0x7f +; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX10-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1] -; GFX10-NEXT: 
s_sub_i32 s11, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_sub_i32 s19, s10, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s20 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s20 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s22, 0 ; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX10-NEXT: s_lshl_b32 s23, s14, 31 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX10-NEXT: s_sub_i32 s18, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: s_lshl_b32 s19, s14, 31 +; GFX10-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX10-NEXT: s_not_b32 s16, s20 +; GFX10-NEXT: s_sub_i32 s18, s12, 64 +; GFX10-NEXT: s_sub_i32 s14, 64, s12 +; GFX10-NEXT: s_cmp_lt_u32 s12, 64 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 -; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[4:5], s16 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17] -; GFX11-NEXT: s_sub_i32 s17, s18, 64 -; GFX11-NEXT: s_sub_i32 s19, 64, s18 -; GFX11-NEXT: s_cmp_lt_u32 s18, 64 -; GFX11-NEXT: s_mov_b32 s22, 0 -; GFX11-NEXT: s_cselect_b32 s23, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s18, 0 +; GFX11-NEXT: s_and_b32 s17, s16, 0x7f +; 
GFX11-NEXT: s_mov_b32 s18, 0 +; GFX11-NEXT: s_sub_i32 s19, s17, 64 +; GFX11-NEXT: s_sub_i32 s21, 64, s17 +; GFX11-NEXT: s_cmp_lt_u32 s17, 64 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s19 -; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s18 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[0:1], s18 -; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX11-NEXT: s_cmp_lg_u32 s23, 0 -; GFX11-NEXT: s_cselect_b64 s[18:19], s[18:19], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s17, 0 +; GFX11-NEXT: s_cselect_b32 s17, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s21 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s16 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s16 +; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX11-NEXT: s_lshl_b32 s23, s10, 31 +; GFX11-NEXT: s_lshl_b32 s19, s10, 31 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] -; GFX11-NEXT: s_sub_i32 s23, s16, 64 -; GFX11-NEXT: s_sub_i32 s17, 64, s16 -; GFX11-NEXT: s_cmp_lt_u32 s16, 64 +; GFX11-NEXT: s_and_not1_b32 s10, 0x7f, s16 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX11-NEXT: s_not_b32 s19, s16 +; GFX11-NEXT: s_sub_i32 s21, s10, 64 +; GFX11-NEXT: s_sub_i32 s16, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 ; GFX11-NEXT: s_cselect_b32 s26, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s16, 0 +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 ; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s19 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[8:9], s16 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[8:9], s19 +; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s21 ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s27, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX11-NEXT: s_and_b32 s10, s20, 0x7f +; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX11-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1] -; GFX11-NEXT: s_sub_i32 s11, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_sub_i32 s19, s10, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 ; GFX11-NEXT: s_cselect_b32 s21, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; 
GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s20 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s20 +; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 ; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX11-NEXT: s_lshl_b32 s23, s14, 31 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX11-NEXT: s_sub_i32 s18, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 +; GFX11-NEXT: s_lshl_b32 s19, s14, 31 +; GFX11-NEXT: s_and_not1_b32 s12, 0x7f, s20 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX11-NEXT: s_not_b32 s16, s20 +; GFX11-NEXT: s_sub_i32 s18, s12, 64 +; GFX11-NEXT: s_sub_i32 s14, 64, s12 +; GFX11-NEXT: s_cmp_lt_u32 s12, 64 ; GFX11-NEXT: s_cselect_b32 s19, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cmp_eq_u32 s12, 0 ; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 -; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[4:5], s16 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result @@ -7592,56 +7676,54 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v16 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v23 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 -; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v23 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v23 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19 +; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 +; GFX6-NEXT: v_lshl_b64 v[23:24], v[0:1], v19 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v25 -; GFX6-NEXT: v_or_b32_e32 v16, v16, v18 -; 
GFX6-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX6-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX6-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX6-NEXT: v_not_b32_e32 v8, v16 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v24 -; GFX6-NEXT: v_subrev_i32_e32 v23, vcc, 64, v24 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v24 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v23 +; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v23 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v24 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v23 +; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v23 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v24 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18 -; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v18 +; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v20 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v19 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7651,88 +7733,88 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX6-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 
+; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v19 -; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v19 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v19 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v14 +; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 -; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v19 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v14 +; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v14 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v15 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX6-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 ; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v16, v16 -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v23 -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19 +; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX8-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX8-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX8-NEXT: v_not_b32_e32 v8, v16 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] -; GFX8-NEXT: 
v_sub_u32_e32 v10, vcc, 64, v24 -; GFX8-NEXT: v_subrev_u32_e32 v23, vcc, 64, v24 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v23 +; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18 -; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v18 +; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7742,87 +7824,87 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v19 -; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v19 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5] +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v14 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX8-NEXT: 
v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v22, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX8-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_not_b32_e32 v16, v16 -; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v16, 64, v23 -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v23 -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19 +; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX9-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v1, v17, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc +; GFX9-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX9-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v18, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v22, v16, v3, vcc +; GFX9-NEXT: v_not_b32_e32 v8, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v17, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v8 ; GFX9-NEXT: v_lshl_or_b32 v1, v10, 31, v1 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 -; GFX9-NEXT: v_subrev_u32_e32 v23, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v23 +; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX9-NEXT: 
v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v2, v18, v2 ; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_not_b32_e32 v8, v20 -; GFX9-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 -; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v18 +; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7833,89 +7915,91 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v20, v8, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v19, v8, v7, vcc +; GFX9-NEXT: v_not_b32_e32 v8, v20 ; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v19 -; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v19 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v14 +; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX9-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 ; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX10-NEXT: v_not_b32_e32 v16, v16 +; GFX10-NEXT: v_not_b32_e32 v21, v16 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27 -; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16 +; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v21 ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: 
v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28 -; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v0, v24, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX10-NEXT: v_or_b32_e32 v24, v24, v26 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v27 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s4 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v17, v24, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v23, v19, v3, s4 ; GFX10-NEXT: v_and_b32_e32 v24, 0x7f, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v23, v19, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v17, v9, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 +; GFX10-NEXT: v_not_b32_e32 v16, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX10-NEXT: v_not_b32_e32 v3, v20 ; GFX10-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v24 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v24 +; GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v16 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v3 ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7] -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] ; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v22 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 @@ -7953,88 +8037,87 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX11-NEXT: v_not_b32_e32 v16, v16 +; GFX11-NEXT: v_not_b32_e32 v21, v16 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] 
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 -; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v21 +; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v21, 0, v21 :: v_dual_cndmask_b32 v22, 0, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] -; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX11-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v28 -; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX11-NEXT: v_or_b32_e32 v18, v16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v19, v1, v19 :: v_dual_cndmask_b32 v18, v0, v18 +; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 -; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v0, v24, v26 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v26 +; GFX11-NEXT: v_dual_cndmask_b32 v25, 0, v1 :: v_dual_cndmask_b32 v16, v16, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v10, v17, v24, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v23, v19, v3, s0 ; GFX11-NEXT: v_and_b32_e32 v24, 0x7f, v20 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v23, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s0 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v23, v19, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v16, v8, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v17, v9, s1 -; GFX11-NEXT: 
v_sub_nc_u32_e32 v11, 64, v24 -; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v9, s1 +; GFX11-NEXT: v_not_b32_e32 v16, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX11-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-NEXT: v_not_b32_e32 v3, v20 ; GFX11-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v16 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5] ; GFX11-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7] ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 -; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] ; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v22 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 +; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 ; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 ; GFX11-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo ; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v22 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v16, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v18, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v10, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v9, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v4, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 8538dcabca924..58304d2072d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -347,49 +347,57 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX8-LABEL: s_fshr_i8: ; GFX8: ; 
%bb.0: +; GFX8-NEXT: s_andn2_b32 s3, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s3, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_lshr_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_andn2_b32 s3, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s3, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s3, 7, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i8: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s3, 7, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_and_b32 s3, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -414,33 +422,33 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX8-LABEL: v_fshr_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 
v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v3, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 @@ -451,9 +459,9 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX11-LABEL: v_fshr_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v3, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 @@ -687,25 +695,29 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX8-LABEL: s_fshr_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 -; GFX8-NEXT: s_and_b32 s6, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_andn2_b32 s6, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s1, s1, s6 -; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s5, 7 +; GFX8-NEXT: s_lshl_b32 s1, s3, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s5, 7 +; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s1, s3, s1 -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 @@ -714,25 +726,29 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX9-LABEL: s_fshr_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 -; GFX9-NEXT: s_and_b32 s6, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_andn2_b32 s6, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; 
GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s1, s1, s6 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s5, 7 +; GFX9-NEXT: s_lshl_b32 s1, s3, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s5, 7 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_lshr_b32 s1, s3, s1 -; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 @@ -741,24 +757,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX10-LABEL: s_fshr_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_andn2_b32 s5, 7, s2 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s5, s2, 8 -; GFX10-NEXT: s_and_b32 s6, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s4, s1, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshr_b32 s6, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_andn2_b32 s5, 7, s6 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s6, s6, 7 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s5, 7 -; GFX10-NEXT: s_andn2_b32 s5, 7, s5 +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 -; GFX10-NEXT: s_lshr_b32 s2, s4, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_and_b32 s0, s0, 0xff @@ -768,24 +788,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX11-LABEL: s_fshr_v2i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s2 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-NEXT: s_and_b32 s6, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshr_b32 s6, s2, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, s5 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s6 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s6, s6, 7 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s2, s5, 7 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s3, s3, s5 -; GFX11-NEXT: s_lshr_b32 s2, s4, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 +; GFX11-NEXT: s_lshr_b32 s1, 
s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xff @@ -832,23 +856,23 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX8-LABEL: v_fshr_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_not_b32_e32 v2, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v5 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -857,23 +881,23 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX9-LABEL: v_fshr_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_not_b32_e32 v2, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v5 +; GFX9-NEXT: v_lshrrev_b16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -885,20 +909,20 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: v_not_b32_e32 v7, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 -; GFX10-NEXT: v_lshlrev_b16 v4, v7, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v6, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0 ; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -912,22 +936,22 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b16 v3, v3, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v4, v6, v4 -; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: v_lshlrev_b16 v0, v7, v0 ; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 @@ -997,50 +1021,58 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX8-LABEL: s_fshr_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s0, 24 ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 -; GFX8-NEXT: s_and_b32 s12, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_andn2_b32 s12, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: 
s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s12 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s1, s1, s12 -; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s9, 7 +; GFX8-NEXT: s_lshl_b32 s1, s3, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s9, 7 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s1, s3, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 -; GFX8-NEXT: s_lshl_b32 s4, s4, 1 -; GFX8-NEXT: s_lshl_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s2, s4, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s3, s10, 7 ; GFX8-NEXT: s_and_b32 s4, s7, 0xff -; GFX8-NEXT: s_or_b32 s1, s2, s1 -; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_lshr_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s11, 7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshr_b32 s3, s4, s3 ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 -; GFX8-NEXT: s_lshl_b32 s5, s5, 1 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s3, s5, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s4, s11, 7 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshl_b32 s4, s5, s4 -; GFX8-NEXT: s_lshr_b32 s3, s8, s3 +; GFX8-NEXT: s_lshr_b32 s4, s8, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff @@ -1050,50 +1082,58 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX9-LABEL: s_fshr_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 -; GFX9-NEXT: s_and_b32 s12, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_andn2_b32 s12, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s12 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s1, s1, s12 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 
-; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s9, 7 +; GFX9-NEXT: s_lshl_b32 s1, s3, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s9, 7 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_lshr_b32 s1, s3, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 -; GFX9-NEXT: s_lshl_b32 s4, s4, 1 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s2, s4, 1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s3, s10, 7 ; GFX9-NEXT: s_and_b32 s4, s7, 0xff -; GFX9-NEXT: s_or_b32 s1, s2, s1 -; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX9-NEXT: s_lshr_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_or_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s11, 7 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_lshr_b32 s3, s4, s3 ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 -; GFX9-NEXT: s_lshl_b32 s5, s5, 1 +; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s3, s5, 1 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_and_b32 s4, s11, 7 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshl_b32 s4, s5, s4 -; GFX9-NEXT: s_lshr_b32 s3, s8, s3 +; GFX9-NEXT: s_lshr_b32 s4, s8, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff -; GFX9-NEXT: s_or_b32 s3, s4, s3 +; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s3, 0xff @@ -1104,43 +1144,51 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-LABEL: s_fshr_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_andn2_b32 s12, 7, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s9, 7 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_andn2_b32 s2, 7, s9 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_and_b32 s9, s9, 7 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_lshr_b32 s1, s1, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s9 -; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX10-NEXT: s_lshl_b32 s0, s0, s12 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s3, s6, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: 
s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s10, 7 -; GFX10-NEXT: s_andn2_b32 s3, 7, s10 -; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: s_or_b32 s1, s2, s3 +; GFX10-NEXT: s_andn2_b32 s2, 7, s10 +; GFX10-NEXT: s_lshl_b32 s3, s4, 1 +; GFX10-NEXT: s_and_b32 s4, s7, 0xff +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_andn2_b32 s4, 7, s11 -; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s3, s4, s6 +; GFX10-NEXT: s_lshl_b32 s4, s5, 1 +; GFX10-NEXT: s_andn2_b32 s5, 7, s11 ; GFX10-NEXT: s_and_b32 s6, s11, 7 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s5, s8, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_or_b32 s2, s2, s3 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff @@ -1157,43 +1205,51 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-LABEL: s_fshr_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s0, 24 ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_lshr_b32 s9, s2, 8 ; GFX11-NEXT: s_lshr_b32 s10, s2, 16 ; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s2, s9, 7 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s9 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s9, s9, 7 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_lshr_b32 s1, s1, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s9 -; GFX11-NEXT: s_lshr_b32 s2, s6, s2 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s0, s0, s12 +; GFX11-NEXT: s_lshl_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s3, s6, s9 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s10, 7 -; GFX11-NEXT: s_and_not1_b32 s3, 7, s10 -; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s10 +; GFX11-NEXT: s_lshl_b32 s3, s4, 1 +; GFX11-NEXT: s_and_b32 s4, s7, 0xff +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s6, s2 -; GFX11-NEXT: s_and_not1_b32 s4, 7, s11 -; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_lshl_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s3, s4, s6 +; GFX11-NEXT: s_lshl_b32 
s4, s5, 1 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s11 ; GFX11-NEXT: s_and_b32 s6, s11, 7 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_lshl_b32 s4, s4, s5 ; GFX11-NEXT: s_lshr_b32 s5, s8, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_or_b32 s3, s4, s5 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff @@ -1272,40 +1328,41 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshr_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v7, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v7, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8 -; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v7, -1 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 +; GFX8-NEXT: v_xor_b32_sdwa v9, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX8-NEXT: v_mov_b32_e32 v8, 0xff -; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v9, 1 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, 7 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_xor_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; 
GFX8-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v10 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v8, v10, v8 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1321,40 +1378,41 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshr_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_not_b32_e32 v7, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, 1, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, v7, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 -; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v5, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v7, -1 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 7 -; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_xor_b32_sdwa v9, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX9-NEXT: v_mov_b32_e32 v8, 0xff -; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v10 -; GFX9-NEXT: 
v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v10 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v9, 7 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v10, v10, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v10 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1371,45 +1429,46 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX10-NEXT: v_not_b32_e32 v8, v2 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, -1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v10, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_mov_b32_e32 v3, 7 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX10-NEXT: v_mov_b32_e32 v14, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_not_b32_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_not_b32_sdwa v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX10-NEXT: v_mov_b32_e32 v10, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX10-NEXT: v_xor_b32_sdwa v9, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v10, 7 +; GFX10-NEXT: v_xor_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v12, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v13, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v10, 7, v14 +; 
GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b16 v3, v5, v9 -; GFX10-NEXT: v_lshlrev_b16 v5, v8, v6 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v5, v5, v8 +; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 ; GFX10-NEXT: v_lshrrev_b16 v1, v15, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX10-NEXT: v_lshlrev_b16 v3, v3, v7 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v11 ; GFX10-NEXT: v_lshrrev_b16 v7, v12, v13 -; GFX10-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, 8 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 @@ -1427,29 +1486,29 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_not_b32_e32 v12, v7 +; GFX11-NEXT: v_xor_b32_e32 v12, -1, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX11-NEXT: v_not_b32_e32 v14, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX11-NEXT: v_xor_b32_e32 v14, -1, v11 ; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 -; GFX11-NEXT: v_not_b32_e32 v7, v13 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX11-NEXT: v_not_b32_e32 v10, v2 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3 -; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 +; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-NEXT: v_lshlrev_b16 v4, v12, v4 @@ -5112,51 +5171,46 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) define amdgpu_ps i64 @s_fshr_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) { ; GFX6-LABEL: s_fshr_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX6-NEXT: 
s_andn2_b64 s[4:5], 63, s[4:5] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX6-NEXT: s_not_b32 s5, s4 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX8-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX8-NEXT: s_not_b32 s5, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX9-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX9-NEXT: s_not_b32 s5, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_andn2_b64 s[6:7], 63, s[4:5] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], 63 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX10-NEXT: s_not_b32 s5, s4 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_not1_b64 s[6:7], 63, s[4:5] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_and_b64 s[4:5], s[4:5], 63 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX11-NEXT: s_not_b32 s5, s4 ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: ; return to shader part epilog @@ -5233,12 +5287,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6-LABEL: v_fshr_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: v_not_b32_e32 v5, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v5 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5246,12 +5300,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8-LABEL: v_fshr_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: v_not_b32_e32 v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5259,12 +5313,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9-LABEL: v_fshr_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5410,38 +5464,38 @@ define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) { define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) { ; GFX6-LABEL: v_fshr_i64_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v0 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v0 -; GFX6-NEXT: v_lshr_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 +; GFX6-NEXT: v_lshr_b64 v[3:4], s[2:3], v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i64_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] +; GFX8-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i64_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] +; GFX9-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i64_ssv: @@ -5478,43 +5532,43 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg %amt) { ; GFX6-LABEL: v_fshr_i64_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX6-NEXT: s_not_b32 s3, s2 +; GFX6-NEXT: s_and_b32 s2, s2, 63 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s4 -; GFX6-NEXT: 
s_lshl_b64 s[0:1], s[0:1], s2 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i64_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX8-NEXT: s_not_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 63 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i64_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX9-NEXT: s_not_b32 s3, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 63 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i64_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: s_and_b32 s3, s2, 63 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5522,10 +5576,10 @@ define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; ; GFX11-LABEL: v_fshr_i64_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_and_b32 s3, s2, 63 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX11-NEXT: s_not_b32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -5542,10 +5596,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX6-LABEL: v_fshr_i64_vss: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s2 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_andn2_b32 s3, 63, s2 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s3 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -5553,10 +5606,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX8-LABEL: v_fshr_i64_vss: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1] -; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_andn2_b32 s3, 63, s2 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 
v0, s0, v0
; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
; GFX8-NEXT: ; return to shader part epilog
@@ -5564,10 +5616,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
; GFX9-LABEL: v_fshr_i64_vss:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1]
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
+; GFX9-NEXT: s_andn2_b32 s3, 63, s2
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
; GFX9-NEXT: v_or_b32_e32 v1, s1, v1
; GFX9-NEXT: ; return to shader part epilog
@@ -5575,10 +5626,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
; GFX10-LABEL: v_fshr_i64_vss:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3]
-; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63
+; GFX10-NEXT: s_andn2_b32 s3, 63, s2
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: v_or_b32_e32 v1, s1, v1
; GFX10-NEXT: ; return to shader part epilog
@@ -5586,13 +5636,12 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
; GFX11-LABEL: v_fshr_i64_vss:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3]
-; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_and_not1_b32 s3, 63, s2
; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
; GFX11-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, s1, v1
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5603,63 +5652,55 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i64:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX6-NEXT: s_not_b32 s9, s8
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63
-; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX6-NEXT: s_not_b32 s4, s10
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s10
; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX8-NEXT: s_not_b32 s9, s8
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63
-; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX8-NEXT: s_not_b32 s4, s10
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s10
; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_v2i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX9-NEXT: s_not_b32 s9, s8
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63
-; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX9-NEXT: s_not_b32 s4, s10
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s10
; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_v2i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_andn2_b64 s[12:13], 63, s[8:9]
-; GFX10-NEXT: s_and_b64 s[8:9], s[8:9], 63
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
-; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX10-NEXT: s_not_b32 s9, s8
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_and_b64 s[10:11], s[10:11], 63
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX10-NEXT: s_not_b32 s9, s10
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s9
; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
@@ -5667,15 +5708,13 @@ define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %
;
; GFX11-LABEL: s_fshr_v2i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_not1_b64 s[12:13], 63, s[8:9]
-; GFX11-NEXT: s_and_b64 s[8:9], s[8:9], 63
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
-; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[10:11]
+; GFX11-NEXT: s_not_b32 s9, s8
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_and_b64 s[10:11], s[10:11], 63
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX11-NEXT: s_not_b32 s9, s10
+; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s9
; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10
; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
@@ -5688,18 +5727,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX6-LABEL: v_fshr_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v9, 63, v8
-; GFX6-NEXT: v_not_b32_e32 v8, v8
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT: v_not_b32_e32 v9, v8
+; GFX6-NEXT: v_and_b32_e32 v9, 63, v9
; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v9
-; GFX6-NEXT: v_not_b32_e32 v8, v10
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX6-NEXT: v_not_b32_e32 v4, v10
+; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
; GFX6-NEXT: v_and_b32_e32 v4, 63, v10
-; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v8
; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5709,18 +5748,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX8-LABEL: v_fshr_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v9, 63, v8
-; GFX8-NEXT: v_not_b32_e32 v8, v8
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT: v_not_b32_e32 v9, v8
+; GFX8-NEXT: v_and_b32_e32 v9, 63, v9
; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
-; GFX8-NEXT: v_not_b32_e32 v8, v10
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_not_b32_e32 v4, v10
+; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX8-NEXT: v_and_b32_e32 v4, 63, v10
-; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5730,18 +5769,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX9-LABEL: v_fshr_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
-; GFX9-NEXT: v_not_b32_e32 v8, v8
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT: v_not_b32_e32 v9, v8
+; GFX9-NEXT: v_and_b32_e32 v9, 63, v9
; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
-; GFX9-NEXT: v_not_b32_e32 v8, v10
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX9-NEXT: v_not_b32_e32 v4, v10
+; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX9-NEXT: v_and_b32_e32 v4, 63, v10
-; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
; GFX9-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5800,231 +5839,237 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: s_fshr_i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
-; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], 1
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
; GFX6-NEXT: s_mov_b32 s1, 0
; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_sub_i32 s11, s8, 64
-; GFX6-NEXT: s_sub_i32 s9, 64, s8
-; GFX6-NEXT: s_cmp_lt_u32 s8, 64
-; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s8, 0
+; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX6-NEXT: s_not_b32 s9, s8
+; GFX6-NEXT: s_sub_i32 s16, s2, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s2
+; GFX6-NEXT: s_cmp_lt_u32 s2, 64
; GFX6-NEXT: s_cselect_b32 s17, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s8
-; GFX6-NEXT: s_lshr_b64 s[14:15], s[12:13], s9
-; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
-; GFX6-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX6-NEXT: s_lshl_b64 s[12:13], s[12:13], s11
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13]
+; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cselect_b32 s18, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
+; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT: s_sub_i32 s14, s10, 64
-; GFX6-NEXT: s_sub_i32 s12, 64, s10
-; GFX6-NEXT: s_cmp_lt_u32 s10, 64
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
+; GFX6-NEXT: s_and_b32 s0, s8, 0x7f
+; GFX6-NEXT: s_sub_i32 s14, s0, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s15, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s10, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s10
-; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
-; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; GFX6-NEXT: s_cmp_lg_u32 s15, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s15, 0
; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX6-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
-; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], 1
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_sub_i32 s11, s8, 64
-; GFX8-NEXT: s_sub_i32 s9, 64, s8
-; GFX8-NEXT: s_cmp_lt_u32 s8, 64
-; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s8, 0
+; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX8-NEXT: s_not_b32 s9, s8
+; GFX8-NEXT: s_sub_i32 s16, s2, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s2
+; GFX8-NEXT: s_cmp_lt_u32 s2, 64
; GFX8-NEXT: s_cselect_b32 s17, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[12:13], s8
-; GFX8-NEXT: s_lshr_b64 s[14:15], s[12:13], s9
-; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
-; GFX8-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX8-NEXT: s_lshl_b64 s[12:13], s[12:13], s11
-; GFX8-NEXT: s_cmp_lg_u32 s16, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13]
+; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cselect_b32 s18, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
+; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX8-NEXT: s_cmp_lg_u32 s17, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX8-NEXT: s_sub_i32 s14, s10, 64
-; GFX8-NEXT: s_sub_i32 s12, 64, s10
-; GFX8-NEXT: s_cmp_lt_u32 s10, 64
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
+; GFX8-NEXT: s_cmp_lg_u32 s18, 0
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
+; GFX8-NEXT: s_and_b32 s0, s8, 0x7f
+; GFX8-NEXT: s_sub_i32 s14, s0, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s15, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s10, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s10
-; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
-; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; GFX8-NEXT: s_cmp_lg_u32 s15, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s15, 0
; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX8-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
-; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], 1
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_sub_i32 s11, s8, 64
-; GFX9-NEXT: s_sub_i32 s9, 64, s8
-; GFX9-NEXT: s_cmp_lt_u32 s8, 64
-; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s8, 0
+; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX9-NEXT: s_not_b32 s9, s8
+; GFX9-NEXT: s_sub_i32 s16, s2, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s2
+; GFX9-NEXT: s_cmp_lt_u32 s2, 64
; GFX9-NEXT: s_cselect_b32 s17, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[12:13], s8
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[12:13], s9
-; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
-; GFX9-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX9-NEXT: s_lshl_b64 s[12:13], s[12:13], s11
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13]
+; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s18, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
+; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX9-NEXT: s_cmp_lg_u32 s17, 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX9-NEXT: s_sub_i32 s14, s10, 64
-; GFX9-NEXT: s_sub_i32 s12, 64, s10
-; GFX9-NEXT: s_cmp_lt_u32 s10, 64
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
+; GFX9-NEXT: s_cmp_lg_u32 s18, 0
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
+; GFX9-NEXT: s_and_b32 s0, s8, 0x7f
+; GFX9-NEXT: s_sub_i32 s14, s0, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s15, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s10, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s10
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
-; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; GFX9-NEXT: s_cmp_lg_u32 s15, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s15, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_i128:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s12, s1, 31
-; GFX10-NEXT: s_mov_b32 s13, 0
+; GFX10-NEXT: s_lshr_b32 s10, s1, 31
+; GFX10-NEXT: s_mov_b32 s11, 0
+; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13]
-; GFX10-NEXT: s_sub_i32 s11, s8, 64
-; GFX10-NEXT: s_sub_i32 s9, 64, s8
-; GFX10-NEXT: s_cmp_lt_u32 s8, 64
-; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s8, 0
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT: s_not_b32 s14, s8
+; GFX10-NEXT: s_sub_i32 s16, s9, 64
+; GFX10-NEXT: s_sub_i32 s10, 64, s9
+; GFX10-NEXT: s_cmp_lt_u32 s9, 64
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s9
-; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8
-; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
-; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11
-; GFX10-NEXT: s_cmp_lg_u32 s16, 0
-; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX10-NEXT: s_cmp_eq_u32 s9, 0
+; GFX10-NEXT: s_cselect_b32 s9, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s10
+; GFX10-NEXT: s_lshl_b64 s[12:13], s[2:3], s14
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[0:1], s14
+; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s16
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
+; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT: s_sub_i32 s14, s10, 64
-; GFX10-NEXT: s_sub_i32 s11, 64, s10
-; GFX10-NEXT: s_cmp_lt_u32 s10, 64
+; GFX10-NEXT: s_and_b32 s0, s8, 0x7f
+; GFX10-NEXT: s_sub_i32 s14, s0, 64
+; GFX10-NEXT: s_sub_i32 s9, 64, s0
+; GFX10-NEXT: s_cmp_lt_u32 s0, 64
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s10, 0
+; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s10
-; GFX10-NEXT: s_lshl_b64 s[12:13], s[6:7], s11
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[6:7], s10
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s8
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s9
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7]
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0
-; GFX10-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0
+; GFX10-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1]
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i128:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9]
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s12, s1, 31
-; GFX11-NEXT: s_mov_b32 s13, 0
+; GFX11-NEXT: s_lshr_b32 s10, s1, 31
+; GFX11-NEXT: s_mov_b32 s11, 0
+; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13]
-; GFX11-NEXT: s_sub_i32 s11, s8, 64
-; GFX11-NEXT: s_sub_i32 s9, 64, s8
-; GFX11-NEXT: s_cmp_lt_u32 s8, 64
-; GFX11-NEXT: s_cselect_b32 s16, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s8, 0
+; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX11-NEXT: s_not_b32 s14, s8
+; GFX11-NEXT: s_sub_i32 s16, s9, 64
+; GFX11-NEXT: s_sub_i32 s10, 64, s9
+; GFX11-NEXT: s_cmp_lt_u32 s9, 64
; GFX11-NEXT: s_cselect_b32 s17, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s9
-; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8
-; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
-; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX11-NEXT: s_cmp_eq_u32 s9, 0
+; GFX11-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s10
+; GFX11-NEXT: s_lshl_b64 s[12:13], s[2:3], s14
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[0:1], s14
+; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s16
; GFX11-NEXT: s_cmp_lg_u32 s17, 0
+; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT: s_sub_i32 s14, s10, 64
-; GFX11-NEXT: s_sub_i32 s11, 64, s10
-; GFX11-NEXT: s_cmp_lt_u32 s10, 64
+; GFX11-NEXT: s_and_b32 s0, s8, 0x7f
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_sub_i32 s14, s0, 64
+; GFX11-NEXT: s_sub_i32 s9, 64, s0
+; GFX11-NEXT: s_cmp_lt_u32 s0, 64
; GFX11-NEXT: s_cselect_b32 s15, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s10, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: s_cselect_b32 s16, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s10
-; GFX11-NEXT: s_lshl_b64 s[12:13], s[6:7], s11
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[6:7], s10
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s8
+; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s9
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; GFX11-NEXT: s_cmp_lg_u32 s15, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7]
; GFX11-NEXT: s_cmp_lg_u32 s16, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX11-NEXT: s_cmp_lg_u32 s15, 0
-; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0
-; GFX11-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0
+; GFX11-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1]
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -6035,29 +6080,29 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-LABEL: v_fshr_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX6-NEXT: v_not_b32_e32 v8, v8
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
+; GFX6-NEXT: v_lshl_b64 v[9:10], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX6-NEXT: v_not_b32_e32 v0, v8
+; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v0
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v15
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0
+; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15
; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15
-; GFX6-NEXT: v_lshl_b64 v[12:13], v[8:9], v15
-; GFX6-NEXT: v_or_b32_e32 v10, v0, v10
-; GFX6-NEXT: v_or_b32_e32 v11, v1, v11
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[8:9], v16
+; GFX6-NEXT: v_lshl_b64 v[13:14], v[9:10], v15
+; GFX6-NEXT: v_or_b32_e32 v11, v0, v11
+; GFX6-NEXT: v_or_b32_e32 v12, v1, v12
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[9:10], v16
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
-; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc
+; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8
+; GFX6-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v14
; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], v14
; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], v2
@@ -6074,38 +6119,38 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v10, v0
; GFX6-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX6-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v12, v3
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX8-NEXT: v_not_b32_e32 v8, v8
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT: v_not_b32_e32 v0, v8
+; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10]
+; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15
-; GFX8-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9]
-; GFX8-NEXT: v_or_b32_e32 v10, v0, v10
-; GFX8-NEXT: v_or_b32_e32 v11, v1, v11
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10]
+; GFX8-NEXT: v_or_b32_e32 v11, v0, v11
+; GFX8-NEXT: v_or_b32_e32 v12, v1, v12
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v14
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
@@ -6122,39 +6167,39 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v10, v0
; GFX8-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX9-NEXT: v_not_b32_e32 v8, v8
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX9-NEXT: v_not_b32_e32 v0, v8
+; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10]
+; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15
-; GFX9-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9]
-; GFX9-NEXT: v_or_b32_e32 v10, v0, v10
-; GFX9-NEXT: v_or_b32_e32 v11, v1, v11
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[8:9]
+; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10]
+; GFX9-NEXT: v_or_b32_e32 v11, v0, v11
+; GFX9-NEXT: v_or_b32_e32 v12, v1, v12
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
+; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 64, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14
@@ -6170,10 +6215,10 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v10, v0
; GFX9-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX9-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX9-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v12, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshr_i128:
@@ -6282,158 +6327,158 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
; GFX6-LABEL: v_fshr_i128_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0
-; GFX6-NEXT: v_not_b32_e32 v0, v0
-; GFX6-NEXT: s_mov_b32 s9, 0
-; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0
+; GFX6-NEXT: v_not_b32_e32 v1, v0
+; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT: s_lshr_b32 s8, s1, 31
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v7
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[10:11], v0
-; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v7
+; GFX6-NEXT: s_lshr_b32 s0, s1, 31
+; GFX6-NEXT: s_mov_b32 s1, 0
+; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
+; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1
+; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7
; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7
-; GFX6-NEXT: v_lshl_b64 v[4:5], s[10:11], v7
-; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_lshl_b64 v[0:1], s[10:11], v8
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_lshl_b64 v[1:2], s[8:9], v8
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT: v_mov_b32_e32 v3, s0
+; GFX6-NEXT: v_mov_b32_e32 v4, s1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v6
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v6
+; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v10
; GFX6-NEXT: v_lshl_b64 v[2:3], s[6:7], v2
-; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v6
+; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10
; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
; GFX6-NEXT: v_lshr_b64 v[0:1], s[6:7], v11
-; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v6
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v10
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX6-NEXT: v_or_b32_e32 v0, v8, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v9, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v6, v1
; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v9, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i128_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: s_mov_b32 s9, 0
-; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0
+; GFX8-NEXT: v_not_b32_e32 v1, v0
+; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT: s_lshr_b32 s8, s1, 31
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v7
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11]
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1]
+; GFX8-NEXT: s_lshr_b32 s0, s1, 31
+; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
+; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, s[10:11]
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11]
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v6
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5]
+; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7]
-; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v6
+; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10
; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v9, v1
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v9, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i128_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s9, 0
-; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0
+; GFX9-NEXT: v_not_b32_e32 v1, v0
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT: s_lshr_b32 s8, s1, 31
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX9-NEXT: v_sub_u32_e32 v0, 64, v7
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11]
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1]
+; GFX9-NEXT: s_lshr_b32 s0, s1, 31
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
+; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
+; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, s[10:11]
-; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11]
+; GFX9-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 64, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5]
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
+; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7]
-; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v6
+; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10
; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX9-NEXT: v_or_b32_e32 v0, v8, v0
-; GFX9-NEXT: v_or_b32_e32 v1, v9, v1
+; GFX9-NEXT: v_or_b32_e32 v1, v6, v1
; GFX9-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX9-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX9-NEXT: v_or_b32_e32 v3, v9, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i128_ssv:
@@ -6543,40 +6588,41 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshr_i128_svs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
; GFX6-NEXT: s_mov_b32 s1, 0
; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_sub_i32 s7, s4, 64
-; GFX6-NEXT: s_sub_i32 s5, 64, s4
-; GFX6-NEXT: s_cmp_lt_u32 s4, 64
-; GFX6-NEXT: s_cselect_b32 s12, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX6-NEXT: s_not_b32 s5, s4
+; GFX6-NEXT: s_sub_i32 s12, s2, 64
+; GFX6-NEXT: s_sub_i32 s8, 64, s2
+; GFX6-NEXT: s_cmp_lt_u32 s2, 64
; GFX6-NEXT: s_cselect_b32 s13, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[8:9], s4
-; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s5
-; GFX6-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX6-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s7
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cselect_b32 s14, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
-; GFX6-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX6-NEXT: s_sub_i32 s0, s6, 64
-; GFX6-NEXT: s_sub_i32 s1, 64, s6
-; GFX6-NEXT: s_cmp_lt_u32 s6, 64
-; GFX6-NEXT: s_cselect_b32 s7, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s6, 0
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s6
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX6-NEXT: s_and_b32 s0, s4, 0x7f
+; GFX6-NEXT: s_sub_i32 s1, s0, 64
+; GFX6-NEXT: s_sub_i32 s4, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
+; GFX6-NEXT: s_cselect_b32 s5, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s6
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0
-; GFX6-NEXT: s_and_b32 s0, 1, s7
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1
+; GFX6-NEXT: s_and_b32 s0, 1, s5
; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
@@ -6590,46 +6636,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX6-NEXT: v_or_b32_e32 v0, s2, v0
; GFX6-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT: v_or_b32_e32 v2, s4, v2
-; GFX6-NEXT: v_or_b32_e32 v3, s5, v3
+; GFX6-NEXT: v_or_b32_e32 v2, s6, v2
+; GFX6-NEXT: v_or_b32_e32 v3, s7, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i128_svs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_sub_i32 s7, s4, 64
-; GFX8-NEXT: s_sub_i32 s5, 64, s4
-; GFX8-NEXT: s_cmp_lt_u32 s4, 64
-; GFX8-NEXT: s_cselect_b32 s12, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX8-NEXT: s_not_b32 s5, s4
+; GFX8-NEXT: s_sub_i32 s12, s2, 64
+; GFX8-NEXT: s_sub_i32 s8, 64, s2
+; GFX8-NEXT: s_cmp_lt_u32 s2, 64
; GFX8-NEXT: s_cselect_b32 s13, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[8:9], s4
-; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX8-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s7
-; GFX8-NEXT: s_cmp_lg_u32 s12, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cselect_b32 s14, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX8-NEXT: s_sub_i32 s0, s6, 64
-; GFX8-NEXT: s_sub_i32 s1, 64, s6
-; GFX8-NEXT: s_cmp_lt_u32 s6, 64
-; GFX8-NEXT: s_cselect_b32 s7, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s6, 0
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u32 s14, 0
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX8-NEXT: s_and_b32 s0, s4, 0x7f
+; GFX8-NEXT: s_sub_i32 s1, s0, 64
+; GFX8-NEXT: s_sub_i32 s4, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
+; GFX8-NEXT: s_cselect_b32 s5, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX8-NEXT: s_and_b32 s0, 1, s7
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
+; GFX8-NEXT: s_and_b32 s0, 1, s5
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
@@ -6643,46 +6690,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT: v_or_b32_e32 v2, s4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, s5, v3
+; GFX8-NEXT: v_or_b32_e32 v2, s6, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s7, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i128_svs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_sub_i32 s7, s4, 64
-; GFX9-NEXT: s_sub_i32 s5, 64, s4
-; GFX9-NEXT: s_cmp_lt_u32 s4, 64
-; GFX9-NEXT: s_cselect_b32 s12, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX9-NEXT: s_not_b32 s5, s4
+; GFX9-NEXT: s_sub_i32 s12, s2, 64
+; GFX9-NEXT: s_sub_i32 s8, 64, s2
+; GFX9-NEXT: s_cmp_lt_u32 s2, 64
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[8:9], s4
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX9-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s7
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s14, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
-; GFX9-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX9-NEXT: s_sub_i32 s0, s6, 64
-; GFX9-NEXT: s_sub_i32 s1, 64, s6
-; GFX9-NEXT: s_cmp_lt_u32 s6, 64
-; GFX9-NEXT: s_cselect_b32 s7, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s6, 0
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u32 s14, 0
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX9-NEXT: s_and_b32 s0, s4, 0x7f
+; GFX9-NEXT: s_sub_i32 s1, s0, 64
+; GFX9-NEXT: s_sub_i32 s4, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
+; GFX9-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX9-NEXT: s_and_b32 s0, 1, s7
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
+; GFX9-NEXT: s_and_b32 s0, 1, s5
; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
@@ -6696,50 +6744,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
; GFX9-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX9-NEXT: v_or_b32_e32 v2, s4, v2
-; GFX9-NEXT: v_or_b32_e32 v3, s5, v3
+; GFX9-NEXT: v_or_b32_e32 v2, s6, v2
+; GFX9-NEXT: v_or_b32_e32 v3, s7, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i128_svs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s8, s1, 31
-; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: s_lshr_b32 s6, s1, 31
+; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT: s_sub_i32 s7, s4, 64
-; GFX10-NEXT: s_sub_i32 s5, 64, s4
-; GFX10-NEXT: s_cmp_lt_u32 s4, 64
-; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT: s_not_b32 s10, s4
+; GFX10-NEXT: s_sub_i32 s12, s5, 64
+; GFX10-NEXT: s_sub_i32 s6, 64, s5
+; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s5
-; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s4
-; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s7
-; GFX10-NEXT: s_cmp_lg_u32 s12, 0
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT: s_cmp_eq_u32 s5, 0
+; GFX10-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s10
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
+; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT: s_sub_i32 s0, 64, s6
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX10-NEXT: s_sub_i32 s0, s6, 64
-; GFX10-NEXT: s_cmp_lt_u32 s6, 64
-; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s6, 0
+; GFX10-NEXT: s_and_b32 s0, s4, 0x7f
+; GFX10-NEXT: s_sub_i32 s1, 64, s0
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX10-NEXT: s_sub_i32 s1, s0, 64
+; GFX10-NEXT: s_cmp_lt_u32 s0, 64
+; GFX10-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3]
+; GFX10-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT: s_cselect_b32 s7, 1, 0
-; GFX10-NEXT: s_and_b32 s0, 1, s1
+; GFX10-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-NEXT: s_and_b32 s1, 1, s4
; GFX10-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_and_b32 s0, 1, s7
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3]
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX10-NEXT: s_and_b32 s0, 1, s5
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo
@@ -6749,64 +6798,65 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0
; GFX10-NEXT: v_or_b32_e32 v2, s2, v2
; GFX10-NEXT: v_or_b32_e32 v3, s3, v3
-; GFX10-NEXT: v_or_b32_e32 v0, s4, v0
-; GFX10-NEXT: v_or_b32_e32 v1, s5, v1
+; GFX10-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX10-NEXT: v_or_b32_e32 v1, s9, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshr_i128_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5]
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s8, s1, 31
-; GFX11-NEXT: s_mov_b32 s9, 0
+; GFX11-NEXT: s_lshr_b32 s6, s1, 31
+; GFX11-NEXT: s_mov_b32 s7, 0
+; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX11-NEXT: s_sub_i32 s7, s4, 64
-; GFX11-NEXT: s_sub_i32 s5, 64, s4
-; GFX11-NEXT: s_cmp_lt_u32 s4, 64
-; GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX11-NEXT: s_cselect_b32 s12, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
+; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX11-NEXT: s_not_b32 s10, s4
+; GFX11-NEXT: s_sub_i32 s12, s5, 64
+; GFX11-NEXT: s_sub_i32 s6, 64, s5
+; GFX11-NEXT: s_cmp_lt_u32 s5, 64
; GFX11-NEXT: s_cselect_b32 s13, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s5
-; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s4
-; GFX11-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s7
-; GFX11-NEXT: s_cmp_lg_u32 s12, 0
-; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT: s_cmp_eq_u32 s5, 0
+; GFX11-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s10
+; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
+; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
; GFX11-NEXT: s_cmp_lg_u32 s13, 0
+; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s5, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT: s_sub_i32 s0, 64, s6
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7f
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX11-NEXT: s_sub_i32 s0, s6, 64
-; GFX11-NEXT: s_cmp_lt_u32 s6, 64
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX11-NEXT: s_cselect_b32 s1, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s6, 0
+; GFX11-NEXT: s_sub_i32 s1, 64, s0
+; GFX11-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX11-NEXT: s_sub_i32 s1, s0, 64
+; GFX11-NEXT: s_cmp_lt_u32 s0, 64
+; GFX11-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3]
+; GFX11-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX11-NEXT: s_cselect_b32 s7, 1, 0
-; GFX11-NEXT: s_and_b32 s0, 1, s1
+; GFX11-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-NEXT: s_and_b32 s1, 1, s4
; GFX11-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT: s_and_b32 s0, 1, s7
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3]
+; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX11-NEXT: s_and_b32 s0, 1, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v2, s2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v3, s3, v3
-; GFX11-NEXT: v_or_b32_e32 v0, s4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v1, s5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX11-NEXT: v_or_b32_e32 v1, s9, v1
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
%cast.result = bitcast i128 %result to <4 x float>
@@ -6816,51 +6866,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshr_i128_vss:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT: s_sub_i32 s5, s4, 64
-; GFX6-NEXT: s_sub_i32 s7, 64, s4
; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
-; GFX6-NEXT: s_cmp_lt_u32 s4, 64
+; GFX6-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX6-NEXT: s_sub_i32 s6, s5, 64
+; GFX6-NEXT: s_sub_i32 s7, 64, s5
+; GFX6-NEXT: s_cmp_lt_u32 s5, 64
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_cmp_eq_u32 s5, 0
; GFX6-NEXT: s_cselect_b32 s9, 1, 0
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4
-; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s4
-; GFX6-NEXT: s_and_b32 s4, 1, s8
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX6-NEXT: s_and_b32 s4, 1, s9
-; GFX6-NEXT: s_sub_i32 s10, s6, 64
-; GFX6-NEXT: s_sub_i32 s8, 64, s6
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s5
; GFX6-NEXT: v_or_b32_e32 v6, v0, v6
; GFX6-NEXT: v_or_b32_e32 v7, v1, v7
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s5
-; GFX6-NEXT: s_cmp_lt_u32 s6, 64
-; GFX6-NEXT: s_cselect_b32 s11, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s6, 0
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s6
+; GFX6-NEXT: s_and_b32 s5, 1, s8
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT: s_and_b32 s5, 1, s9
; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX6-NEXT: s_sub_i32 s10, s5, 64
+; GFX6-NEXT: s_sub_i32 s8, 64, s5
+; GFX6-NEXT: s_cmp_lt_u32 s5, 64
+; GFX6-NEXT: s_cselect_b32 s11, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s5, 0
; GFX6-NEXT: s_cselect_b32 s12, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s6
-; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
-; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX6-NEXT: s_cmp_lg_u32 s11, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT: s_cmp_lg_u32 s11, 0
; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v4
; GFX6-NEXT: v_or_b32_e32 v1, s1, v5
; GFX6-NEXT: v_or_b32_e32 v2, s2, v2
@@ -6869,51 +6919,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX8-LABEL: v_fshr_i128_vss:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT: s_sub_i32 s5, s4, 64
-; GFX8-NEXT: s_sub_i32 s7, 64, s4
; GFX8-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
-; GFX8-NEXT: s_cmp_lt_u32 s4, 64
+; GFX8-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT: s_sub_i32 s6, s5, 64
+; GFX8-NEXT: s_sub_i32 s7, 64, s5
+; GFX8-NEXT: s_cmp_lt_u32 s5, 64
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cmp_eq_u32 s5, 0
; GFX8-NEXT: s_cselect_b32 s9, 1, 0
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5]
-; GFX8-NEXT: s_and_b32 s4, 1, s8
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT: s_and_b32 s4, 1, s9
-; GFX8-NEXT: s_sub_i32 s10, s6, 64
-; GFX8-NEXT: s_sub_i32 s8, 64, s6
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], s5, v[4:5]
; GFX8-NEXT: v_or_b32_e32 v6, v0, v6
; GFX8-NEXT: v_or_b32_e32 v7, v1, v7
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5]
-; GFX8-NEXT: s_cmp_lt_u32 s6, 64
-; GFX8-NEXT: s_cselect_b32 s11, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s6, 0
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], s6, v[4:5]
+; GFX8-NEXT: s_and_b32 s5, 1, s8
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT: s_and_b32 s5, 1, s9
; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX8-NEXT: s_sub_i32 s10, s5, 64
+; GFX8-NEXT: s_sub_i32 s8, 64, s5
+; GFX8-NEXT: s_cmp_lt_u32 s5, 64
+; GFX8-NEXT: s_cselect_b32 s11, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s5, 0
; GFX8-NEXT: s_cselect_b32 s12, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6
-; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
-; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX8-NEXT: s_cmp_lg_u32 s11, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_cmp_lg_u32 s11, 0
; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
; GFX8-NEXT: v_or_b32_e32 v0, s0, v4
; GFX8-NEXT: v_or_b32_e32 v1, s1, v5
; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
@@ -6922,51 +6972,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX9-LABEL: v_fshr_i128_vss:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: s_sub_i32 s5, s4, 64
-; GFX9-NEXT: s_sub_i32 s7, 64, s4
; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
-; GFX9-NEXT: s_cmp_lt_u32 s4, 64
+; GFX9-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX9-NEXT: s_sub_i32 s6, s5, 64
+; GFX9-NEXT: s_sub_i32 s7, 64, s5
+; GFX9-NEXT: s_cmp_lt_u32 s5, 64
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_cmp_eq_u32 s5, 0
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5]
-; GFX9-NEXT: s_and_b32 s4, 1, s8
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT: s_and_b32 s4, 1, s9
-; GFX9-NEXT: s_sub_i32 s10, s6, 64
-; GFX9-NEXT: s_sub_i32 s8, 64, s6
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], s5, v[4:5]
; GFX9-NEXT: v_or_b32_e32 v6, v0, v6
; GFX9-NEXT: v_or_b32_e32 v7, v1, v7
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5]
-; GFX9-NEXT: s_cmp_lt_u32 s6, 64
-; GFX9-NEXT: s_cselect_b32 s11, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s6, 0
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], s6, v[4:5]
+; GFX9-NEXT: s_and_b32 s5, 1, s8
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT: s_and_b32 s5, 1, s9
; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX9-NEXT: s_sub_i32 s10, s5, 64
+; GFX9-NEXT: s_sub_i32 s8, 64, s5
+; GFX9-NEXT: s_cmp_lt_u32 s5, 64
+; GFX9-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s5, 0
; GFX9-NEXT: s_cselect_b32 s12, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
-; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
; GFX9-NEXT: v_or_b32_e32 v0, s0, v4
; GFX9-NEXT: v_or_b32_e32 v1, s1, v5
; GFX9-NEXT: v_or_b32_e32 v2, s2, v2
@@ -6978,49 +7028,49 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v1
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX10-NEXT: s_sub_i32 s7, 64, s4
+; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
+; GFX10-NEXT: s_sub_i32 s6, s5, 64
; GFX10-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX10-NEXT: s_sub_i32 s5, s4, 64
-; GFX10-NEXT: s_cmp_lt_u32 s4, 64
+; GFX10-NEXT: s_sub_i32 s7, 64, s5
+; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1]
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1]
+; GFX10-NEXT: s_cmp_eq_u32 s5, 0
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: s_and_b32 s4, 1, s8
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 
0, s4 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX10-NEXT: s_and_b32 s5, 1, s8 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: s_and_b32 s4, 1, s9 -; GFX10-NEXT: s_sub_i32 s10, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s6, 1, s9 +; GFX10-NEXT: s_sub_i32 s10, s5, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7031,47 +7081,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s7, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_sub_i32 s6, s5, 64 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX11-NEXT: s_sub_i32 s5, s4, 64 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX11-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-NEXT: s_and_b32 s4, 1, s8 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX11-NEXT: s_and_b32 s5, 1, s8 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, 
v7 -; GFX11-NEXT: s_and_b32 s4, 1, s9 -; GFX11-NEXT: s_sub_i32 s10, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s6, 1, s9 +; GFX11-NEXT: s_sub_i32 s10, s5, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 ; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7209,435 +7259,447 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s24, s1, 31 -; GFX6-NEXT: s_mov_b32 s25, 0 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX6-NEXT: s_sub_i32 s19, s16, 64 -; GFX6-NEXT: s_sub_i32 s17, 64, s16 -; GFX6-NEXT: s_cmp_lt_u32 s16, 64 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_lshr_b32 s22, s1, 31 +; GFX6-NEXT: s_mov_b32 s23, 0 +; GFX6-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX6-NEXT: s_not_b32 s17, s16 +; GFX6-NEXT: s_sub_i32 s21, s2, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s2 +; GFX6-NEXT: s_cmp_lt_u32 s2, 64 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX6-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX6-NEXT: s_cmp_lg_u32 s24, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s29, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; 
GFX6-NEXT: s_sub_i32 s24, s18, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s18 -; GFX6-NEXT: s_cmp_lt_u32 s18, 64 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX6-NEXT: s_cmp_lg_u32 s29, 0 +; GFX6-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] +; GFX6-NEXT: s_and_b32 s0, s16, 0x7f +; GFX6-NEXT: s_sub_i32 s21, s0, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s26, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX6-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX6-NEXT: s_lshr_b32 s24, s5, 31 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX6-NEXT: s_sub_i32 s9, s10, 64 -; GFX6-NEXT: s_sub_i32 s11, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_lshr_b32 s22, s5, 31 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX6-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX6-NEXT: s_not_b32 s16, s20 +; GFX6-NEXT: s_sub_i32 s18, s6, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s6 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_cselect_b32 s19, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s21, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX6-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s21, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX6-NEXT: s_sub_i32 s18, s8, 64 -; GFX6-NEXT: s_sub_i32 s16, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX6-NEXT: s_and_b32 s4, s20, 0x7f +; GFX6-NEXT: s_sub_i32 s18, s4, 64 +; GFX6-NEXT: s_sub_i32 s16, 64, s4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: 
s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s21, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s24, s1, 31 -; GFX8-NEXT: s_mov_b32 s25, 0 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX8-NEXT: s_sub_i32 s19, s16, 64 -; GFX8-NEXT: s_sub_i32 s17, 64, s16 -; GFX8-NEXT: s_cmp_lt_u32 s16, 64 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_lshr_b32 s22, s1, 31 +; GFX8-NEXT: s_mov_b32 s23, 0 +; GFX8-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX8-NEXT: s_not_b32 s17, s16 +; GFX8-NEXT: s_sub_i32 s21, s2, 64 +; GFX8-NEXT: s_sub_i32 s22, 64, s2 +; GFX8-NEXT: s_cmp_lt_u32 s2, 64 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX8-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX8-NEXT: s_cmp_lg_u32 s24, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX8-NEXT: s_sub_i32 s24, s18, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s18 -; GFX8-NEXT: s_cmp_lt_u32 s18, 64 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] +; GFX8-NEXT: s_and_b32 s0, s16, 0x7f +; GFX8-NEXT: s_sub_i32 s21, s0, 64 +; GFX8-NEXT: s_sub_i32 s22, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s18, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX8-NEXT: s_lshr_b64 s[18:19], 
s[8:9], s18 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX8-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX8-NEXT: s_lshr_b32 s24, s5, 31 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX8-NEXT: s_sub_i32 s9, s10, 64 -; GFX8-NEXT: s_sub_i32 s11, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_lshr_b32 s22, s5, 31 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX8-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX8-NEXT: s_not_b32 s16, s20 +; GFX8-NEXT: s_sub_i32 s18, s6, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX8-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX8-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX8-NEXT: s_sub_i32 s18, s8, 64 -; GFX8-NEXT: s_sub_i32 s16, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX8-NEXT: s_and_b32 s4, s20, 0x7f +; GFX8-NEXT: s_sub_i32 s18, s4, 64 +; GFX8-NEXT: s_sub_i32 s16, 64, s4 +; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cselect_b32 s21, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; 
GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshr_b32 s24, s1, 31 -; GFX9-NEXT: s_mov_b32 s25, 0 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX9-NEXT: s_sub_i32 s19, s16, 64 -; GFX9-NEXT: s_sub_i32 s17, 64, s16 -; GFX9-NEXT: s_cmp_lt_u32 s16, 64 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_lshr_b32 s22, s1, 31 +; GFX9-NEXT: s_mov_b32 s23, 0 +; GFX9-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX9-NEXT: s_not_b32 s17, s16 +; GFX9-NEXT: s_sub_i32 s21, s2, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s2 +; GFX9-NEXT: s_cmp_lt_u32 s2, 64 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX9-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX9-NEXT: s_cmp_lg_u32 s24, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s29, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX9-NEXT: s_sub_i32 s24, s18, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s18 -; GFX9-NEXT: s_cmp_lt_u32 s18, 64 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX9-NEXT: s_cmp_lg_u32 s29, 0 +; GFX9-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] +; GFX9-NEXT: s_and_b32 s0, s16, 0x7f +; GFX9-NEXT: s_sub_i32 s21, s0, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s18, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX9-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; 
GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX9-NEXT: s_lshr_b32 s24, s5, 31 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX9-NEXT: s_sub_i32 s9, s10, 64 -; GFX9-NEXT: s_sub_i32 s11, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_lshr_b32 s22, s5, 31 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX9-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX9-NEXT: s_not_b32 s16, s20 +; GFX9-NEXT: s_sub_i32 s18, s6, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_cselect_b32 s19, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX9-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX9-NEXT: s_sub_i32 s18, s8, 64 -; GFX9-NEXT: s_sub_i32 s16, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX9-NEXT: s_and_b32 s4, s20, 0x7f +; GFX9-NEXT: s_sub_i32 s18, s4, 64 +; GFX9-NEXT: s_sub_i32 s16, 64, s4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s21, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; 
GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s22, s1, 31 -; GFX10-NEXT: s_mov_b32 s23, 0 +; GFX10-NEXT: s_lshr_b32 s18, s1, 31 +; GFX10-NEXT: s_mov_b32 s19, 0 +; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[22:23] -; GFX10-NEXT: s_sub_i32 s19, s16, 64 -; GFX10-NEXT: s_sub_i32 s17, 64, s16 -; GFX10-NEXT: s_cmp_lt_u32 s16, 64 -; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s16, 0 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19] +; GFX10-NEXT: s_not_b32 s18, s16 +; GFX10-NEXT: s_sub_i32 s21, s17, 64 +; GFX10-NEXT: s_sub_i32 s22, 64, s17 +; GFX10-NEXT: s_cmp_lt_u32 s17, 64 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 -; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX10-NEXT: s_cmp_lg_u32 s22, 0 -; GFX10-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s17, 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18 +; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s22, s18, 64 -; GFX10-NEXT: s_sub_i32 s19, 64, s18 -; GFX10-NEXT: s_cmp_lt_u32 s18, 64 +; GFX10-NEXT: s_and_b32 s0, s16, 0x7f +; GFX10-NEXT: s_sub_i32 s18, s0, 64 +; GFX10-NEXT: s_sub_i32 s17, 64, s0 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s18, 0 -; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s18 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s19 -; GFX10-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s22 -; GFX10-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16 +; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[18:19], 0 -; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_lshr_b32 s22, s5, 31 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX10-NEXT: s_lshr_b32 s18, s5, 31 +; GFX10-NEXT: s_andn2_b32 s8, 0x7f, s20 +; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23] -; GFX10-NEXT: s_sub_i32 s9, s10, 
64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 -; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s21, 0 -; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX10-NEXT: s_not_b32 s16, s20 ; GFX10-NEXT: s_sub_i32 s18, s8, 64 ; GFX10-NEXT: s_sub_i32 s9, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s9 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s16 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s18 +; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: s_and_b32 s4, s20, 0x7f +; GFX10-NEXT: s_sub_i32 s18, s4, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s4 +; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s20 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[14:15], s8 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[14:15], s20 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s18 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s22, s1, 31 -; GFX11-NEXT: s_mov_b32 s23, 0 +; GFX11-NEXT: s_lshr_b32 s18, s1, 31 +; GFX11-NEXT: s_mov_b32 s19, 0 +; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[22:23] -; GFX11-NEXT: s_sub_i32 s19, s16, 64 -; GFX11-NEXT: s_sub_i32 s17, 64, s16 -; GFX11-NEXT: s_cmp_lt_u32 s16, 64 -; GFX11-NEXT: s_cselect_b32 s22, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s16, 0 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19] +; GFX11-NEXT: s_not_b32 s18, s16 +; GFX11-NEXT: s_sub_i32 s21, s17, 64 +; GFX11-NEXT: s_sub_i32 s22, 64, s17 +; 
GFX11-NEXT: s_cmp_lt_u32 s17, 64 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 -; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX11-NEXT: s_cmp_lg_u32 s22, 0 -; GFX11-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s17, 0 +; GFX11-NEXT: s_cselect_b32 s17, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18 +; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s22, s18, 64 -; GFX11-NEXT: s_sub_i32 s19, 64, s18 -; GFX11-NEXT: s_cmp_lt_u32 s18, 64 +; GFX11-NEXT: s_and_b32 s0, s16, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s18, s0, 64 +; GFX11-NEXT: s_sub_i32 s17, 64, s0 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s26, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s18, 0 -; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s18 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[10:11], s19 -; GFX11-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s22 -; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16 +; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX11-NEXT: s_cmp_lg_u32 s27, 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[18:19], 0 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21] +; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_lshr_b32 s22, s5, 31 ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX11-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX11-NEXT: s_lshr_b32 s18, s5, 31 +; GFX11-NEXT: s_and_not1_b32 s8, 0x7f, s20 +; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23] -; GFX11-NEXT: s_sub_i32 s9, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 -; GFX11-NEXT: s_cselect_b32 s21, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s21, 0 -; 
GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX11-NEXT: s_not_b32 s16, s20 ; GFX11-NEXT: s_sub_i32 s18, s8, 64 ; GFX11-NEXT: s_sub_i32 s9, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 ; GFX11-NEXT: s_cselect_b32 s19, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] -; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s9 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s16 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s18 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX11-NEXT: s_and_b32 s4, s20, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s18, s4, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s4 +; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s20 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[14:15], s8 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[14:15], s20 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s18 ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: ; return to shader part epilog @@ -7649,68 +7711,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshr_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v16 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], 1 +; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v24 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[16:17], v0 -; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v24 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v24 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[16:17], v24 -; GFX6-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX6-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[16:17], v25 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v23 +; GFX6-NEXT: 
v_not_b32_e32 v0, v16 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 +; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_lshl_b64 v[23:24], v[17:18], v19 +; GFX6-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX6-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v25 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v22 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v22 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 -; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v22 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v23 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_not_b32_e32 v8, v20 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v22 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v19 +; GFX6-NEXT: v_not_b32_e32 v4, v20 +; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v19 -; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v19 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v19 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 +; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v18 ; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v20 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v19 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc ; GFX6-NEXT: 
v_sub_i32_e32 v6, vcc, 64, v18 @@ -7729,8 +7791,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX6-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v10, v6 @@ -7740,68 +7802,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v16, v16 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v24 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17] -; GFX8-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v24 -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17] -; GFX8-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX8-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9] +; GFX8-NEXT: v_not_b32_e32 v0, v16 +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] +; GFX8-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX8-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v22 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v22 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, 
v17, vcc -; GFX8-NEXT: v_not_b32_e32 v8, v20 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v19 +; GFX8-NEXT: v_not_b32_e32 v4, v20 +; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v19 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18 @@ -7820,8 +7882,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v10, v6 @@ -7831,68 +7893,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-LABEL: v_fshr_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_not_b32_e32 v16, v16 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17] -; GFX9-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v24 -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17] -; GFX9-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX9-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc 
-; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9] +; GFX9-NEXT: v_not_b32_e32 v0, v16 +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] +; GFX9-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX9-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v22 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX9-NEXT: v_not_b32_e32 v8, v20 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, 64, v19 +; GFX9-NEXT: v_not_b32_e32 v4, v20 +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v19 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, 
vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; GFX9-NEXT: v_sub_u32_e32 v6, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc @@ -7911,8 +7973,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX9-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v10, v6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 404e726246f4d..81abe91b283f9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -2787,52 +2787,51 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-LABEL: v_sdiv_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5 +; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0 +; CGP-NEXT: v_rcp_f32_e32 v7, v3 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 -; CGP-NEXT: v_rcp_f32_e32 v8, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0 +; CGP-NEXT: v_mov_b32_e32 v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 +; CGP-NEXT: v_mul_lo_u32 v4, v1, v5 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v1 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v4, v1, v8, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mov_b32_e32 v5, v1 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v1, v5, v3 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v5 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v5 ; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v6, v0 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v7, v0 ; 
CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v4 ; CGP-NEXT: v_mov_b32_e32 v7, v1 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v7, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v7 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 3729f1cc2b12d..183f2edbf9035 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -563,18 +563,21 @@ define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX8-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_i32_zext_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i32_zext_i16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i16 %x, 16383 %ext = zext i16 %and to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 5b94e71ecf52e..cfac0c2fa56aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3286,45 +3286,45 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-LABEL: v_srem_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5 +; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0 +; CGP-NEXT: v_rcp_f32_e32 v7, v3 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, 
v5 -; CGP-NEXT: v_rcp_f32_e32 v8, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0 +; CGP-NEXT: v_mov_b32_e32 v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v4, v1, v5 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v7, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v0, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v3, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v1, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index e31d8e95bd608..1ee521b3dedac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2147,26 +2147,26 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 -; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v3 -; CGP-NEXT: v_rcp_f32_e32 v4, v2 +; CGP-NEXT: v_rcp_f32_e32 v4, v1 ; CGP-NEXT: v_rcp_f32_e32 v5, v3 ; CGP-NEXT: v_mul_f32_e32 v4, v0, v4 -; CGP-NEXT: v_mul_f32_e32 v5, v1, v5 +; CGP-NEXT: v_mul_f32_e32 v5, v2, v5 ; CGP-NEXT: v_trunc_f32_e32 v4, v4 
; CGP-NEXT: v_trunc_f32_e32 v5, v5 -; CGP-NEXT: v_mad_f32 v0, -v4, v2, v0 +; CGP-NEXT: v_mad_f32 v0, -v4, v1, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mad_f32 v1, -v5, v3, v1 +; CGP-NEXT: v_mad_f32 v2, -v5, v3, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v2 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, v3 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, v3 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index f30b278b3e611..a7e5ce3d21619 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2561,12 +2561,12 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 -; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 ; CGP-NEXT: v_rcp_f32_e32 v8, v5 ; CGP-NEXT: v_rcp_f32_e32 v9, v7 @@ -2584,10 +2584,10 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mul_lo_u32 v2, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v1, v4, v1 ; CGP-NEXT: v_mul_lo_u32 v3, v5, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v2, v3 ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v1 ; CGP-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll index 9ea9fa91e4f92..1b35a89ad7f93 100644 --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -278,7 +278,6 @@ define amdgpu_ps i64 @s_csh_64_0(i64 inreg %a, i64 inreg %b) { ; ; GISEL-LABEL: s_csh_64_0: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 63 ; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2 ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 @@ -310,7 +309,6 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) { ; ; GISEL-LABEL: s_csh_64_1: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 0xff ; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2 ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index a0b549711f339..93e14a205f05d 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; 
GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 +; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 24 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] @@ -1837,7 +1837,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1 +; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 25 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll index c27118446cc2f..cafd903df2d56 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll @@ -1,47 +1,79 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=UNPACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s -; GCN-LABEL: {{^}}buffer_load_format_d16_x: -; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_x: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_x v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_x: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_x v0, off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call half @llvm.amdgcn.raw.ptr.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret half %data } -; GCN-LABEL: {{^}}buffer_load_format_d16_xy: -; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] define amdgpu_ps half @buffer_load_format_d16_xy(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xy: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_xy v[0:1], off, s[0:3], 0 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt 
vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xy: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_xy v0, off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: -; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 define amdgpu_ps half @buffer_load_format_d16_xyz(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyz: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_xyz v[0:2], off, s[0:3], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v2 ; encoding: [0x02,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyz: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_xyz v[0:1], off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_mov_b32_e32 v0, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <3 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: -; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyzw(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyzw: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_xyzw v[0:3], off, s[0:3], 0 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyzw: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <4 x half> %data, i32 3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll index 3a396b54f89ab..39df6ec679e88 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll @@ -1,57 +1,107 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga 
-verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=UNPACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s -; GCN-LABEL: {{^}}buffer_load_format_d16_x: -; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_x: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x20,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_x: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call half @llvm.amdgcn.struct.ptr.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret half %data } -; GCN-LABEL: {{^}}buffer_load_format_d16_xy: -; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] define amdgpu_ps half @buffer_load_format_d16_xy(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xy: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_xy v[0:1], v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x24,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xy: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_xy v0, v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: -; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: 
buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyz(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyz: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_xyz v[0:2], v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x28,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v2 ; encoding: [0x02,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyz: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_xyz v[0:1], v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_mov_b32_e32 v0, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <3 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: -; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyzw(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyzw: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_xyzw v[0:3], v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x2c,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyzw: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_xyzw v[0:1], v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <4 x half> %data, i32 3 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_i16_x: -; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_ps half @buffer_load_format_i16_x(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_i16_x: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x20,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_i16_x: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call i16 
@llvm.amdgcn.struct.ptr.buffer.load.format.i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %fdata = bitcast i16 %data to half diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll index 2f9e6b0a1cf52..55600cab8432b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll @@ -1,12 +1,16 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,SI ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,VI -;CHECK-LABEL: {{^}}buffer_load: -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_load_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v8, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v8, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dwordx4 v[4:7], v8, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_dwordx4 v[8:11], v8, s[0:3], 0 idxen slc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -17,106 +21,165 @@ main_body: ret {<4 x float>, <4 x float>, <4 x float>} %r2 } -;CHECK-LABEL: {{^}}buffer_load_immoffs: -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 40, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_immoffs_large: -;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load_immoffs_large: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_movk_i32 s4, 0x1ffc +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], s4 idxen offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 4, i32 8188, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_idx: -;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) { +; CHECK-LABEL: buffer_load_idx: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: 
buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_ofs: -;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { +; CHECK-LABEL: buffer_load_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_ofs_imm: -;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { +; CHECK-LABEL: buffer_load_ofs_imm: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_both: -;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) { +; CHECK-LABEL: buffer_load_both: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_both_reversed: -;CHECK: v_mov_b32_e32 v2, v0 -;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, i32, i32) { +; CHECK-LABEL: buffer_load_both_reversed: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_x1: -;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps float @buffer_load_x1(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: buffer_load_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) ret float %data } -;CHECK-LABEL: {{^}}buffer_load_x2: -;CHECK: 
buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <2 x float> @buffer_load_x2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: buffer_load_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) ret <2 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_negative_offset: -;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, -16, v0 -;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) { +; SI-LABEL: buffer_load_negative_offset: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: buffer_load_negative_offset: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ; return to shader part epilog main_body: %ofs.1 = add i32 %ofs, -16 %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs.1, i32 0, i32 0) ret <4 x float> %data } -; SI won't merge ds memory operations, because of the signed offset bug, so -; we only have check lines for VI. -; CHECK-LABEL: buffer_load_mmo: -; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 +; SI won't merge ds memory operations, because of the signed offset bug. 
define amdgpu_ps float @buffer_load_mmo(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %lds) { +; SI-LABEL: buffer_load_mmo: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: buffer_load_dword v1, v2, s[0:3], 0 idxen +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_write_b32 v0, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; SI-NEXT: ds_write_b32 v0, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: buffer_load_mmo: +; VI: ; %bb.0: ; %entry +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: buffer_load_dword v1, v2, s[0:3], 0 idxen +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ; return to shader part epilog entry: store float 0.0, ptr addrspace(3) %lds %val = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) @@ -125,12 +188,15 @@ entry: ret float %val } -;CHECK-LABEL: {{^}}buffer_load_int: -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_load_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load_int: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v6, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dwordx2 v[4:5], v6, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_dword v6, v6, s[0:3], 0 idxen slc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x i32> @llvm.amdgcn.struct.ptr.buffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <2 x i32> @llvm.amdgcn.struct.ptr.buffer.load.v2i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -144,13 +210,13 @@ main_body: ret {<4 x float>, <2 x float>, float} %r2 } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_ubyte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_ubyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_ubyte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i8 @llvm.amdgcn.struct.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = zext i8 %tmp to i32 @@ -158,13 +224,13 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_ushort: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_ushort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_ushort: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = zext i16 %tmp to i32 @@ -172,13 +238,13 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_sbyte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_sbyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_sbyte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_sbyte v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i8 @llvm.amdgcn.struct.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = sext i8 %tmp to i32 @@ -186,13 +252,13 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_sshort: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_sshort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_sshort: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_sshort v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = sext i16 %tmp to i32 @@ -200,72 +266,84 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b16 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call half @llvm.amdgcn.struct.ptr.buffer.load.f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store half %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v2f16: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b32 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v2f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <2 x half> %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v4f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b64 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v4f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[1:2] +; CHECK-NEXT: s_endpgm main_body: %val = call <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <4 x half> %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b16 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store i16 %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v2i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b32 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v2i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call <2 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v2i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <2 x i16> %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v4i16: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b64 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v4i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[1:2] +; CHECK-NEXT: s_endpgm main_body: %val = call <4 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <4 x i16> %val, ptr addrspace(3) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll index 8109fca4a043a..58b422dd6a751 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll @@ -1,85 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s -; GCN-LABEL: {{^}}buffer_store_format_d16_x: -; GCN: s_load_dword s[[LO:[0-9]+]] -; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] -; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_x(ptr addrspace(8) %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { +; GCN-LABEL: buffer_store_format_d16_x: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_load_dword s4, s[6:7], 0x30 +; GCN-NEXT: s_load_dword s5, s[6:7], 0x54 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_format_d16_x v0, v1, s[0:3], 0 idxen +; GCN-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.f16(half %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -; GCN-LABEL: {{^}}buffer_store_format_d16_xy: - -; UNPACKED: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}} -; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] -; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen - -; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %index) { +; UNPACKED-LABEL: buffer_store_format_d16_xy: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; UNPACKED-NEXT: s_waitcnt lgkmcnt(0) +; UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 +; UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; 
UNPACKED-NEXT: v_mov_b32_e32 v1, s6 +; UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; UNPACKED-NEXT: buffer_store_format_d16_xy v[0:1], v2, s[0:3], 0 idxen +; UNPACKED-NEXT: s_endpgm +; +; PACKED-LABEL: buffer_store_format_d16_xy: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PACKED-NEXT: buffer_store_format_d16_xy v0, v1, s[0:3], 0 idxen +; PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: -; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 - -; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} - -; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] -; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] - -; UNPACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen - -; PACKED: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} -; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] -; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] - -; PACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) { +; UNPACKED-LABEL: buffer_store_format_d16_xyz: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; UNPACKED-NEXT: s_waitcnt lgkmcnt(0) +; UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff +; UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 +; UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; UNPACKED-NEXT: v_mov_b32_e32 v1, s7 +; UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; UNPACKED-NEXT: v_mov_b32_e32 v3, s6 +; UNPACKED-NEXT: buffer_store_format_d16_xyz v[0:2], v3, s[0:3], 0 idxen +; UNPACKED-NEXT: s_endpgm +; +; PACKED-LABEL: buffer_store_format_d16_xyz: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: s_and_b32 s5, s5, 0xffff +; PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PACKED-NEXT: v_mov_b32_e32 v2, s8 +; PACKED-NEXT: buffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 idxen +; PACKED-NEXT: s_endpgm main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> call void @llvm.amdgcn.struct.ptr.buffer.store.format.v3f16(<3 x half> %data_subvec, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 - -; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} -; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} - -; 
UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] -; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] - -; UNPACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen - -; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] -; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] - -; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) { +; UNPACKED-LABEL: buffer_store_format_d16_xyzw: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; UNPACKED-NEXT: s_waitcnt lgkmcnt(0) +; UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 +; UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff +; UNPACKED-NEXT: s_lshr_b32 s8, s4, 16 +; UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; UNPACKED-NEXT: v_mov_b32_e32 v1, s8 +; UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; UNPACKED-NEXT: v_mov_b32_e32 v3, s7 +; UNPACKED-NEXT: v_mov_b32_e32 v4, s6 +; UNPACKED-NEXT: buffer_store_format_d16_xyzw v[0:3], v4, s[0:3], 0 idxen +; UNPACKED-NEXT: s_endpgm +; +; PACKED-LABEL: buffer_store_format_d16_xyzw: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PACKED-NEXT: v_mov_b32_e32 v2, s8 +; PACKED-NEXT: buffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 idxen +; PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -; GCN-LABEL: {{^}}buffer_store_format_i16_x: -; GCN: s_load_dword s[[LO:[0-9]+]] -; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] -; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_i16_x(ptr addrspace(8) %rsrc, [8 x i32], i16 %data, [8 x i32], i32 %index) { +; GCN-LABEL: buffer_store_format_i16_x: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_load_dword s4, s[6:7], 0x30 +; GCN-NEXT: s_load_dword s5, s[6:7], 0x54 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_format_d16_x v0, v1, s[0:3], 0 idxen +; GCN-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.i16(i16 %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll index 13217b24dcd4b..61a08d96986b0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll @@ -1,12 +1,15 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck 
-check-prefixes=CHECK,SI %s +;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VI %s -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { +; CHECK-LABEL: buffer_store: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v12, 0 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v12, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_format_xyzw v[4:7], v12, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_store_format_xyzw v[8:11], v12, s[0:3], 0 idxen slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -14,47 +17,56 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { +; CHECK-LABEL: buffer_store_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen offset:42 +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_idx: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_idx: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, v4 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both_reversed: -;CHECK: v_mov_b32_e32 v6, v4 -;CHECK-NOT: s_waitcnt -;CHECK: 
buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both_reversed: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i32 0, i32 0) ret void @@ -62,14 +74,23 @@ main_body: ; Ideally, the register allocator would avoid the wait here ; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { +; SI-LABEL: buffer_store_wait: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: buffer_store_wait: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +; VI-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen +; VI-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0, i32 0) @@ -77,28 +98,31 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) { +; CHECK-LABEL: buffer_store_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_x v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_i32: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1_i32(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %index) { +; CHECK-LABEL: buffer_store_x1_i32: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_x v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) { +; CHECK-LABEL: buffer_store_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll index e52af31360764..d08623f685e85 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll @@ -1,12 +1,15 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=CHECK,SI %s +;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VI %s + define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { +; CHECK-LABEL: buffer_store: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v12, 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v12, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_dwordx4 v[4:7], v12, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_store_dwordx4 v[8:11], v12, s[0:3], 0 idxen slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -14,62 +17,79 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { +; CHECK-LABEL: buffer_store_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen offset:42 +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_idx: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_idx: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, v4 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both: -;CHECK-NOT: 
s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both_reversed: -;CHECK: v_mov_b32_e32 v6, v4 -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both_reversed: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i32 0, i32 0) ret void } ; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { +; SI-LABEL: buffer_store_wait: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: buffer_store_wait: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; VI-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; VI-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0, i32 0) @@ -77,30 +97,34 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) { +; CHECK-LABEL: buffer_store_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) #0 { +; CHECK-LABEL: buffer_store_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_int: -;CHECK-NOT: s_waitcnt -;CHECK: 
buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc define amdgpu_ps void @buffer_store_int(ptr addrspace(8) inreg, <4 x i32>, <2 x i32>, i32) { +; CHECK-LABEL: buffer_store_int: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v7, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], v7, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_store_dword v6, v7, s[0:3], 0 idxen slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4i32(<4 x i32> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.store.v2i32(<2 x i32> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -108,12 +132,12 @@ main_body: ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_byte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_ptr_buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { +; CHECK-LABEL: struct_ptr_buffer_store_byte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_byte v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -121,39 +145,63 @@ main_body: ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_ptr_buffer_store_f16(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { +; CHECK-LABEL: struct_ptr_buffer_store_f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm %v2 = fptrunc float %v1 to half call void @llvm.amdgcn.struct.ptr.buffer.store.f16(half %v2, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_v2f16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_v2f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_v2f16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v2f16(<2 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_v4f16: -;CHECK-NEXT: %bb. 
-;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_v4f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_v4f16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v4f16(<4 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_ptr_buffer_store_i16(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { +; CHECK-LABEL: struct_ptr_buffer_store_i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -161,18 +209,39 @@ main_body: ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_vif16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_vif16(ptr addrspace(8) inreg %rsrc, <2 x i16> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_vif16: +; SI: ; %bb.0: +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_vif16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v2i16(<2 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_v4i16: -;CHECK-NEXT: %bb. 
-;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_v4i16: +; SI: ; %bb.0: +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_v4i16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v4i16(<4 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll new file mode 100644 index 0000000000000..506f40516c9e6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -0,0 +1,1736 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define half @v_maximumnum_f16(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_nnan(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: v_maximumnum_f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_1.0(half %x) { +; GFX8-LABEL: v_maximumnum_f16_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half 1.0) + ret half %result +} + +define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_maximumnum_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8-NEXT: 
v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define bfloat @v_maximumnum_bf16_nnan(bfloat %x, 
bfloat %y) { +; GFX8-LABEL: v_maximumnum_bf16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_bf16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_bf16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_bf16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_bf16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define float @v_maximumnum_f32(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_maximumnum_f32_nnan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_maximumnum_f64(double %x, double %y) { +; GFX8-LABEL: v_maximumnum_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: 
v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_maximumnum_f64_nnan(double %x, double %y) { +; GFX8-LABEL: v_maximumnum_f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_maximumnum_f32_1.0(float %x) { +; GFX8-LABEL: v_maximumnum_f32_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float 1.0) + ret float %result +} + +define float @v_maximumnum_f32_rhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %x, float %canon.y) + ret float %result +} + +define float @v_maximumnum_f32_lhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX11: ; %bb.0: 
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %result = call float @llvm.maximumnum.f32(float %canon.x, float %y) + ret float %result +} + +define float @v_maximumnum_f32_both_operands_not_snan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %canon.x, float %canon.y) + ret float %result +} + +define double @v_maximumnum_f64_1.0(double %x) { +; GFX8-LABEL: v_maximumnum_f64_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: v_maximumnum_f64_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double 1.0) + ret double %result +} + +define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { +; GFX8-LABEL: v_maximumnum_f16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f16_e32 
v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) { +; GFX8-LABEL: v_maximumnum_f16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define float @v_maximumnum_f32_s_v(float inreg %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 
s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_maximumnum_f32_v_s(float %x, float inreg %y) { +; GFX8-LABEL: v_maximumnum_f32_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_maximumnum_f32_s_s(float inreg %x, float inreg %y) { +; GFX8-LABEL: v_maximumnum_f32_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_maximumnum_f64_s_v(double inreg %x, double %y) { +; GFX8-LABEL: v_maximumnum_f64_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_maximumnum_f64_v_s(double %x, double inreg %y) { +; GFX8-LABEL: v_maximumnum_f64_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_maximumnum_f64_s_s(double inreg %x, double inreg %y) { +; GFX8-LABEL: v_maximumnum_f64_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_maximumnum_f32_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e32 v0, 
v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %x, float %fabs.y) + ret float %result +} + +define float @v_maximumnum_f32_fneg_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, -1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %fneg.fabs.y = fneg float %fabs.y + %result = call float @llvm.maximumnum.f32(float %x, float %fneg.fabs.y) + ret float %result +} + +define float @v_maximumnum_f32_fabs(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fabs: +; GFX11: 
; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %fabs.x, float %fabs.y) + ret float %result +} + +define float @v_maximumnum_f32_fneg(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, -1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f32_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg float %x + %fneg.y = fneg float %y + %result = call float @llvm.maximumnum.f32(float %fneg.x, float %fneg.y) + ret float %result +} + +define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: 
v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.maximumnum.f16(half %x, half %fabs.y) + ret half %result +} + +define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %fneg.fabs.y = fneg half %fabs.y + %result = call half @llvm.maximumnum.f16(half %x, half %fneg.fabs.y) + ret half %result +} + +define half @v_maximumnum_f16_fabs(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call half @llvm.fabs.f16(half %x) + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.maximumnum.f16(half %fabs.x, half %fabs.y) + ret half %result +} + +define half @v_maximumnum_f16_fneg(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX8-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f16_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg half %x + %fneg.y = fneg half %y + %result = call half @llvm.maximumnum.f16(half %fneg.x, half %fneg.y) + ret half %result +} + +define double @v_maximumnum_f64_fneg(double %x, double %y) { +; GFX8-LABEL: v_maximumnum_f64_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], 
v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], -v[2:3], -v[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg double %x + %fneg.y = fneg double %y + %result = call double @llvm.maximumnum.f64(double %fneg.x, double %fneg.y) + ret double %result +} diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll new file mode 100644 index 0000000000000..a2ba770067d16 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -0,0 +1,1690 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define half @v_minimumnum_f16(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, 
v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_minimumnum_f16_nnan(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_minimumnum_f16_1.0(half %x) { +; GFX8-LABEL: v_minimumnum_f16_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half 1.0) + ret half %result +} + +define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_minimumnum_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: 
v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_bf16: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 
16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_minimumnum_bf16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_bf16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_bf16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_bf16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 
s[30:31] +; +; GFX12-LABEL: v_minimumnum_bf16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define float @v_minimumnum_f32(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_nnan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f32_e32 
v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_minimumnum_f64(double %x, double %y) { +; GFX8-LABEL: v_minimumnum_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_minimumnum_f64_nnan(double %x, double %y) { +; GFX8-LABEL: v_minimumnum_f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + 
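+; Editorial note (not autogenerated; a sketch of the pattern the checks above
+; exercise): without the nnan flag, each operand is first quieted by a
+; self-max (v_max_* x, x on GFX9 and later; v_mul_f32 1.0, x on GFX8 for f32),
+; and then a single min selects the result, e.g. for f32 on GFX9:
+;
+;   v_max_f32_e32 v1, v1, v1   ; canonicalize %y, quieting any sNaN
+;   v_max_f32_e32 v0, v0, v0   ; canonicalize %x, quieting any sNaN
+;   v_min_f32_e32 v0, v0, v1   ; minimumnum of the quieted operands
+;
+; With nnan (see the *_nnan tests) the quieting step is omitted and only the
+; final v_min_* remains; bf16, which has no native min instruction here, is
+; instead expanded through f32 compares and v_cndmask selects.
+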
+define float @v_minimumnum_f32_1.0(float %x) { +; GFX8-LABEL: v_minimumnum_f32_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float 1.0) + ret float %result +} + +define float @v_minimumnum_f32_rhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %x, float %canon.y) + ret float %result +} + +define float @v_minimumnum_f32_lhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %result = call float @llvm.minimumnum.f32(float %canon.x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_both_operands_not_snan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %canon.x, float %canon.y) + ret float %result +} + +define double @v_minimumnum_f64_1.0(double %x) { +; 
GFX8-LABEL: v_minimumnum_f64_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double 1.0) + ret double %result +} + +define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { +; GFX8-LABEL: v_minimumnum_f16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) { +; GFX8-LABEL: v_minimumnum_f16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define float @v_minimumnum_f32_s_v(float inreg %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_v_s(float %x, float inreg %y) { +; GFX8-LABEL: v_minimumnum_f32_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_s_s(float inreg %x, float inreg %y) { +; GFX8-LABEL: v_minimumnum_f32_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_minimumnum_f64_s_v(double inreg %x, double %y) { +; GFX8-LABEL: v_minimumnum_f64_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], 
s[6:7] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_minimumnum_f64_v_s(double %x, double inreg %y) { +; GFX8-LABEL: v_minimumnum_f64_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_minimumnum_f64_s_s(double inreg %x, double inreg %y) { +; GFX8-LABEL: v_minimumnum_f64_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
v_minimumnum_f64_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_minimumnum_f32_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %x, float %fabs.y) + ret float %result +} + +define float @v_minimumnum_f32_fneg_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, -1.0, |v1| +; GFX8-NEXT: 
v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %fneg.fabs.y = fneg float %fabs.y + %result = call float @llvm.minimumnum.f32(float %x, float %fneg.fabs.y) + ret float %result +} + +define float @v_minimumnum_f32_fabs(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %fabs.x, float %fabs.y) + ret float %result +} + +define float @v_minimumnum_f32_fneg(float 
%x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, -1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f32_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg float %x + %fneg.y = fneg float %y + %result = call float @llvm.minimumnum.f32(float %fneg.x, float %fneg.y) + ret float %result +} + +define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.minimumnum.f16(half %x, half %fabs.y) + ret half %result 
+} + +define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %fneg.fabs.y = fneg half %fabs.y + %result = call half @llvm.minimumnum.f16(half %x, half %fneg.fabs.y) + ret half %result +} + +define half @v_minimumnum_f16_fabs(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + 
%fabs.x = call half @llvm.fabs.f16(half %x) + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.minimumnum.f16(half %fabs.x, half %fabs.y) + ret half %result +} + +define half @v_minimumnum_f16_fneg(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX8-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f16_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg half %x + %fneg.y = fneg half %y + %result = call half @llvm.minimumnum.f16(half %fneg.x, half %fneg.y) + ret half %result +} + +define double @v_minimumnum_f64_fneg(double %x, double %y) { +; GFX8-LABEL: v_minimumnum_f64_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], -v[2:3], 
-v[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg double %x + %fneg.y = fneg double %y + %result = call double @llvm.minimumnum.f64(double %fneg.x, double %fneg.y) + ret double %result +} diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir b/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir index 4ff43024ae8cc..0c9f628dfbb2c 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir @@ -1,6 +1,9 @@ # RUN: llc -mtriple=amdgcn -mcpu=kaveri -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=kaveri -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s # GCN-LABEL: {{^}}name: add_shr_i32 # GCN: [[SMOV:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 123 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir b/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir index ef986f8c9d2a3..0ad1b5527c854 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir @@ -1,5 +1,7 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s # test for 3 consecutive _sdwa's # GFX9-LABEL: name: test1_add_co_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir index 4ca39ecc7a0ae..ffbd2d092b5d8 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDWA %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDWA %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=SDWA %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=SDWA %s --- name: add_f16_u32_preserve tracksRegLiveness: true diff --git a/llvm/test/CodeGen/Mips/is_fpclass.ll b/llvm/test/CodeGen/Mips/is_fpclass.ll new file mode 100644 index 0000000000000..9454a064c5312 --- /dev/null +++ b/llvm/test/CodeGen/Mips/is_fpclass.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=mipsisa32r6-unknown-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s + + +define i1 @isnan_float(float %x) nounwind { +; 
CHECK-LABEL: isnan_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) ; nan + ret i1 %1 +} + +define i1 @isnan_double(double %x) nounwind { +; CHECK-LABEL: isnan_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) ; nan + ret i1 %1 +} + +define i1 @isnan_float_strictfp(float %x) strictfp nounwind { +; CHECK-LABEL: isnan_float_strictfp: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) strictfp ; nan + ret i1 %1 +} + +define i1 @isnan_double_strictfp(double %x) strictfp nounwind { +; CHECK-LABEL: isnan_double_strictfp: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) strictfp ; nan + ret i1 %1 +} + +define i1 @isinf_float(float %x) nounwind { +; CHECK-LABEL: isinf_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 68 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" + ret i1 %1 +} + +define i1 @isfinite_float(float %x) nounwind { +; CHECK-LABEL: isfinite_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 952 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" + ret i1 %1 +} + +define i1 @isnormal_float(float %x) nounwind { +; CHECK-LABEL: isnormal_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 136 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 264) ; 0x108 = "normal" + ret i1 %1 +} + +define i1 @issubnormal_float(float %x) nounwind { +; CHECK-LABEL: issubnormal_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 272 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 144) ; 0x90 = "subnormal" + ret i1 %1 +} + +define i1 @iszero_float(float %x) nounwind { +; CHECK-LABEL: iszero_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 544 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 96) ; 0x60 = "zero" + ret i1 %1 +} + +define i1 @issnan_float(float %x) nounwind { +; CHECK-LABEL: issnan_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 1 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 1) + ret i1 %1 +} + +define i1 @issnan_double(double %x) nounwind { +; CHECK-LABEL: issnan_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 1 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 
@llvm.is.fpclass.f64(double %x, i32 1) + ret i1 %1 +} + +define i1 @isqnan_float(float %x) nounwind { +; CHECK-LABEL: isqnan_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 2 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 2) + ret i1 %1 +} + +define i1 @isqnan_double(double %x) nounwind { +; CHECK-LABEL: isqnan_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 2 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 2) + ret i1 %1 +} + +define i1 @isposzero_double(double %x) nounwind { +; CHECK-LABEL: isposzero_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 512 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 64) + ret i1 %1 +} + +define i1 @isnegzero_double(double %x) nounwind { +; CHECK-LABEL: isnegzero_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 32 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 32) + ret i1 %1 +} + +define i1 @isposnormal_double(double %x) nounwind { +; CHECK-LABEL: isposnormal_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 128 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 256) + ret i1 %1 +} + +define i1 @isnegnormal_double(double %x) nounwind { +; CHECK-LABEL: isnegnormal_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 8 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 8) + ret i1 %1 +} + +define i1 @isnormal_double(double %x) nounwind { +; CHECK-LABEL: isnormal_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 136 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 264) + ret i1 %1 +} + +define i1 @isclass_00d_double(double %x) nounwind { +; CHECK-LABEL: isclass_00d_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 13 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 13) + ret i1 %1 +} + +define i1 @isclass_1c0_float(float %x) nounwind { +; CHECK-LABEL: isclass_1c0_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 896 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 448) + ret i1 %1 +} + +declare i1 @llvm.is.fpclass.f32(float, i32) +declare i1 @llvm.is.fpclass.f64(double, i32) diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index 86b36d8f69e95..919214b0e9a8d 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -2341,6 +2341,410 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind { ret i32 %abs } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; RV32I-LABEL: abd_select_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a1, 24 +; 
RV32I-NEXT: srai a2, a2, 24 +; RV32I-NEXT: slli a3, a0, 24 +; RV32I-NEXT: srai a3, a3, 24 +; RV32I-NEXT: blt a3, a2, .LBB34_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB34_2: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a2, a1, 56 +; RV64I-NEXT: srai a2, a2, 56 +; RV64I-NEXT: slli a3, a0, 56 +; RV64I-NEXT: srai a3, a3, 56 +; RV64I-NEXT: blt a3, a2, .LBB34_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB34_2: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; +; ZBB-LABEL: abd_select_i8: +; ZBB: # %bb.0: +; ZBB-NEXT: sext.b a1, a1 +; ZBB-NEXT: sext.b a0, a0 +; ZBB-NEXT: min a2, a0, a1 +; ZBB-NEXT: max a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp slt i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; RV32I-LABEL: abd_select_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: srai a2, a2, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: srai a3, a3, 16 +; RV32I-NEXT: bge a3, a2, .LBB35_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB35_2: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a2, a0, 48 +; RV64I-NEXT: srai a2, a2, 48 +; RV64I-NEXT: slli a3, a1, 48 +; RV64I-NEXT: srai a3, a3, 48 +; RV64I-NEXT: bge a3, a2, .LBB35_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB35_2: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; +; ZBB-LABEL: abd_select_i16: +; ZBB: # %bb.0: +; ZBB-NEXT: sext.h a1, a1 +; ZBB-NEXT: sext.h a0, a0 +; ZBB-NEXT: min a2, a0, a1 +; ZBB-NEXT: max a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp sle i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: abd_select_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: blt a1, a0, .LBB36_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB36_2: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a2, a0 +; RV64I-NEXT: sext.w a3, a1 +; RV64I-NEXT: blt a3, a2, .LBB36_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB36_2: +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: min a2, a0, a1 +; RV32ZBB-NEXT: max a0, a0, a1 +; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a1 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: min a2, a0, a1 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp sgt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: abd_select_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB37_3 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a3 +; RV32I-NEXT: bnez a4, .LBB37_4 +; RV32I-NEXT: .LBB37_2: +; RV32I-NEXT: mv a4, a1 +; 
RV32I-NEXT: mv a5, a0 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: j .LBB37_5 +; RV32I-NEXT: .LBB37_3: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: beqz a4, .LBB37_2 +; RV32I-NEXT: .LBB37_4: +; RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: mv a5, a2 +; RV32I-NEXT: .LBB37_5: +; RV32I-NEXT: sltu a2, a5, a0 +; RV32I-NEXT: sub a1, a4, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, a5, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: bge a0, a1, .LBB37_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB37_2: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: sltu a4, a2, a0 +; RV32ZBB-NEXT: mv a5, a4 +; RV32ZBB-NEXT: beq a1, a3, .LBB37_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a5, a3, a1 +; RV32ZBB-NEXT: .LBB37_2: +; RV32ZBB-NEXT: bnez a5, .LBB37_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: ret +; RV32ZBB-NEXT: .LBB37_4: +; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: min a2, a0, a1 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp sge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; RV32I-LABEL: abd_select_i128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a7, 4(a2) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw t0, 12(a2) +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: beq a5, t0, .LBB38_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt t1, a5, t0 +; RV32I-NEXT: j .LBB38_3 +; RV32I-NEXT: .LBB38_2: +; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: .LBB38_3: +; RV32I-NEXT: lw t3, 0(a2) +; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: beq a3, a7, .LBB38_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: sltu a2, a3, a7 +; RV32I-NEXT: j .LBB38_6 +; RV32I-NEXT: .LBB38_5: +; RV32I-NEXT: sltu a2, a1, t3 +; RV32I-NEXT: .LBB38_6: +; RV32I-NEXT: xor t2, a5, t0 +; RV32I-NEXT: xor t4, a4, a6 +; RV32I-NEXT: or t2, t4, t2 +; RV32I-NEXT: beqz t2, .LBB38_8 +; RV32I-NEXT: # %bb.7: +; RV32I-NEXT: mv a2, t1 +; RV32I-NEXT: .LBB38_8: +; RV32I-NEXT: bnez a2, .LBB38_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a2, t3 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t2, a6 +; RV32I-NEXT: j .LBB38_11 +; RV32I-NEXT: .LBB38_10: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv t1, a3 +; RV32I-NEXT: mv t4, a5 +; RV32I-NEXT: mv t2, a4 +; RV32I-NEXT: mv a1, t3 +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: mv a4, a6 +; RV32I-NEXT: .LBB38_11: +; RV32I-NEXT: sltu a6, a4, t2 +; RV32I-NEXT: sub a7, a5, t4 +; RV32I-NEXT: sltu a5, a1, a2 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: mv a7, a5 +; RV32I-NEXT: beq a3, t1, .LBB38_13 +; RV32I-NEXT: # %bb.12: +; RV32I-NEXT: sltu a7, a3, t1 +; RV32I-NEXT: .LBB38_13: +; RV32I-NEXT: sub a4, a4, t2 +; RV32I-NEXT: sltu t0, a4, a7 +; RV32I-NEXT: sub a6, a6, t0 +; RV32I-NEXT: sub a4, a4, a7 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a6, 12(a0) +; 
RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i128: +; RV64I: # %bb.0: +; RV64I-NEXT: beq a1, a3, .LBB38_3 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: slt a4, a1, a3 +; RV64I-NEXT: beqz a4, .LBB38_4 +; RV64I-NEXT: .LBB38_2: +; RV64I-NEXT: mv a4, a1 +; RV64I-NEXT: mv a5, a0 +; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: j .LBB38_5 +; RV64I-NEXT: .LBB38_3: +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: bnez a4, .LBB38_2 +; RV64I-NEXT: .LBB38_4: +; RV64I-NEXT: mv a4, a3 +; RV64I-NEXT: mv a5, a2 +; RV64I-NEXT: .LBB38_5: +; RV64I-NEXT: sltu a2, a0, a5 +; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a5 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i128: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a4, 4(a1) +; RV32ZBB-NEXT: lw a6, 8(a1) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: sltu a2, a7, a6 +; RV32ZBB-NEXT: mv t4, a2 +; RV32ZBB-NEXT: beq t0, t1, .LBB38_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt t4, t1, t0 +; RV32ZBB-NEXT: .LBB38_2: +; RV32ZBB-NEXT: sltu t2, a5, a3 +; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: mv t3, t2 +; RV32ZBB-NEXT: beq a4, a1, .LBB38_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: mv t3, t5 +; RV32ZBB-NEXT: .LBB38_4: +; RV32ZBB-NEXT: addi sp, sp, -16 +; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBB-NEXT: xor t6, t0, t1 +; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: or t6, s0, t6 +; RV32ZBB-NEXT: beqz t6, .LBB38_6 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: mv t3, t4 +; RV32ZBB-NEXT: .LBB38_6: +; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: beq a1, a4, .LBB38_8 +; RV32ZBB-NEXT: # %bb.7: +; RV32ZBB-NEXT: mv t4, t5 +; RV32ZBB-NEXT: .LBB38_8: +; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: mv t6, t5 +; RV32ZBB-NEXT: beq a4, a1, .LBB38_10 +; RV32ZBB-NEXT: # %bb.9: +; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: .LBB38_10: +; RV32ZBB-NEXT: bnez t3, .LBB38_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sub t0, t1, t0 +; RV32ZBB-NEXT: sub a6, a7, a6 +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sltu a7, a6, t4 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a3, a5, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: j .LBB38_13 +; RV32ZBB-NEXT: .LBB38_12: +; RV32ZBB-NEXT: sltu a2, a6, a7 +; RV32ZBB-NEXT: sub t0, t0, t1 +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sub a6, a6, a7 +; RV32ZBB-NEXT: sltu a7, a6, t6 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a3, a3, a5 +; RV32ZBB-NEXT: sub a4, a4, a1 +; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: .LBB38_13: +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a1, 4(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBB-NEXT: addi sp, sp, 16 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i128: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sltu a4, a2, a0 +; RV64ZBB-NEXT: mv a5, a4 +; RV64ZBB-NEXT: beq a1, a3, .LBB38_2 +; RV64ZBB-NEXT: # %bb.1: +; RV64ZBB-NEXT: slt a5, a3, a1 +; RV64ZBB-NEXT: .LBB38_2: +; RV64ZBB-NEXT: bnez a5, .LBB38_4 +; RV64ZBB-NEXT: # %bb.3: +; RV64ZBB-NEXT: sub a1, a3, a1 +; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: ret +; RV64ZBB-NEXT: .LBB38_4: +; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sub a1, a1, a3 +; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: sub a0, 
a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp slt i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll index 14f45895754df..a9f933243f679 100644 --- a/llvm/test/CodeGen/RISCV/abdu.ll +++ b/llvm/test/CodeGen/RISCV/abdu.ll @@ -1720,6 +1720,398 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ret i128 %sel } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abdu(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; NOZBB-LABEL: abd_select_i8: +; NOZBB: # %bb.0: +; NOZBB-NEXT: andi a2, a1, 255 +; NOZBB-NEXT: andi a3, a0, 255 +; NOZBB-NEXT: bltu a3, a2, .LBB23_2 +; NOZBB-NEXT: # %bb.1: +; NOZBB-NEXT: sub a0, a0, a1 +; NOZBB-NEXT: ret +; NOZBB-NEXT: .LBB23_2: +; NOZBB-NEXT: sub a0, a1, a0 +; NOZBB-NEXT: ret +; +; ZBB-LABEL: abd_select_i8: +; ZBB: # %bb.0: +; ZBB-NEXT: andi a1, a1, 255 +; ZBB-NEXT: andi a0, a0, 255 +; ZBB-NEXT: minu a2, a0, a1 +; ZBB-NEXT: maxu a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp ult i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; RV32I-LABEL: abd_select_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a2, a2, -1 +; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: bgeu a2, a3, .LBB24_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB24_2: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a2, a2, -1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: and a2, a1, a2 +; RV64I-NEXT: bgeu a2, a3, .LBB24_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB24_2: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; +; ZBB-LABEL: abd_select_i16: +; ZBB: # %bb.0: +; ZBB-NEXT: zext.h a1, a1 +; ZBB-NEXT: zext.h a0, a0 +; ZBB-NEXT: minu a2, a0, a1 +; ZBB-NEXT: maxu a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp ule i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: abd_select_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: bltu a1, a0, .LBB25_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB25_2: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a2, a0 +; RV64I-NEXT: sext.w a3, a1 +; RV64I-NEXT: bltu a3, a2, .LBB25_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB25_2: +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: minu a2, a0, a1 +; RV32ZBB-NEXT: maxu a0, a0, a1 +; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: slli a1, a1, 32 +; RV64ZBB-NEXT: srli a1, a1, 32 +; RV64ZBB-NEXT: slli a0, a0, 32 +; RV64ZBB-NEXT: srli a0, a0, 32 +; RV64ZBB-NEXT: minu a2, a0, a1 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp ugt i32 %a, %b + 
%ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: abd_select_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB26_3 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a3 +; RV32I-NEXT: bnez a4, .LBB26_4 +; RV32I-NEXT: .LBB26_2: +; RV32I-NEXT: mv a4, a1 +; RV32I-NEXT: mv a5, a0 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: j .LBB26_5 +; RV32I-NEXT: .LBB26_3: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: beqz a4, .LBB26_2 +; RV32I-NEXT: .LBB26_4: +; RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: mv a5, a2 +; RV32I-NEXT: .LBB26_5: +; RV32I-NEXT: sltu a2, a5, a0 +; RV32I-NEXT: sub a1, a4, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, a5, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: bgeu a0, a1, .LBB26_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB26_2: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sub a3, a1, a3 +; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sub a2, a0, a2 +; RV32ZBB-NEXT: beq a3, a1, .LBB26_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a0, a1, a3 +; RV32ZBB-NEXT: j .LBB26_3 +; RV32ZBB-NEXT: .LBB26_2: +; RV32ZBB-NEXT: sltu a0, a0, a2 +; RV32ZBB-NEXT: .LBB26_3: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: xor a2, a2, a1 +; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: xor a1, a3, a1 +; RV32ZBB-NEXT: add a1, a1, a0 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: minu a2, a0, a1 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp uge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; RV32I-LABEL: abd_select_i128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a7, 4(a2) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw t0, 12(a2) +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: beq a5, t0, .LBB27_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu t1, a5, t0 +; RV32I-NEXT: j .LBB27_3 +; RV32I-NEXT: .LBB27_2: +; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: .LBB27_3: +; RV32I-NEXT: lw t3, 0(a2) +; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: beq a3, a7, .LBB27_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: sltu a2, a3, a7 +; RV32I-NEXT: j .LBB27_6 +; RV32I-NEXT: .LBB27_5: +; RV32I-NEXT: sltu a2, a1, t3 +; RV32I-NEXT: .LBB27_6: +; RV32I-NEXT: xor t2, a5, t0 +; RV32I-NEXT: xor t4, a4, a6 +; RV32I-NEXT: or t2, t4, t2 +; RV32I-NEXT: beqz t2, .LBB27_8 +; RV32I-NEXT: # %bb.7: +; RV32I-NEXT: mv a2, t1 +; RV32I-NEXT: .LBB27_8: +; RV32I-NEXT: bnez a2, .LBB27_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a2, t3 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t2, a6 +; RV32I-NEXT: j .LBB27_11 +; RV32I-NEXT: .LBB27_10: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv t1, a3 +; RV32I-NEXT: mv t4, a5 +; RV32I-NEXT: mv t2, a4 +; RV32I-NEXT: mv a1, t3 +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: mv a4, a6 +; RV32I-NEXT: .LBB27_11: +; RV32I-NEXT: sltu a6, a4, t2 +; RV32I-NEXT: sub a7, a5, t4 +; RV32I-NEXT: sltu a5, a1, a2 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: mv a7, a5 
+; RV32I-NEXT: beq a3, t1, .LBB27_13 +; RV32I-NEXT: # %bb.12: +; RV32I-NEXT: sltu a7, a3, t1 +; RV32I-NEXT: .LBB27_13: +; RV32I-NEXT: sub a4, a4, t2 +; RV32I-NEXT: sltu t0, a4, a7 +; RV32I-NEXT: sub a6, a6, t0 +; RV32I-NEXT: sub a4, a4, a7 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a6, 12(a0) +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i128: +; RV64I: # %bb.0: +; RV64I-NEXT: beq a1, a3, .LBB27_3 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sltu a4, a1, a3 +; RV64I-NEXT: beqz a4, .LBB27_4 +; RV64I-NEXT: .LBB27_2: +; RV64I-NEXT: mv a4, a1 +; RV64I-NEXT: mv a5, a0 +; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: j .LBB27_5 +; RV64I-NEXT: .LBB27_3: +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: bnez a4, .LBB27_2 +; RV64I-NEXT: .LBB27_4: +; RV64I-NEXT: mv a4, a3 +; RV64I-NEXT: mv a5, a2 +; RV64I-NEXT: .LBB27_5: +; RV64I-NEXT: sltu a2, a0, a5 +; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a5 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i128: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a6, 12(a1) +; RV32ZBB-NEXT: lw t0, 4(a2) +; RV32ZBB-NEXT: lw a1, 4(a1) +; RV32ZBB-NEXT: sltu a2, a4, a7 +; RV32ZBB-NEXT: sub t1, a6, t1 +; RV32ZBB-NEXT: sltu t2, a3, a5 +; RV32ZBB-NEXT: sub a2, t1, a2 +; RV32ZBB-NEXT: mv t1, t2 +; RV32ZBB-NEXT: beq a1, t0, .LBB27_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu t1, a1, t0 +; RV32ZBB-NEXT: .LBB27_2: +; RV32ZBB-NEXT: sub a7, a4, a7 +; RV32ZBB-NEXT: sltu t3, a7, t1 +; RV32ZBB-NEXT: sub a2, a2, t3 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: beq a2, a6, .LBB27_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: sltu t1, a6, a2 +; RV32ZBB-NEXT: j .LBB27_5 +; RV32ZBB-NEXT: .LBB27_4: +; RV32ZBB-NEXT: sltu t1, a4, a7 +; RV32ZBB-NEXT: .LBB27_5: +; RV32ZBB-NEXT: sub t0, a1, t0 +; RV32ZBB-NEXT: sub t0, t0, t2 +; RV32ZBB-NEXT: sub a5, a3, a5 +; RV32ZBB-NEXT: beq t0, a1, .LBB27_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: sltu a1, a1, t0 +; RV32ZBB-NEXT: j .LBB27_8 +; RV32ZBB-NEXT: .LBB27_7: +; RV32ZBB-NEXT: sltu a1, a3, a5 +; RV32ZBB-NEXT: .LBB27_8: +; RV32ZBB-NEXT: xor a3, a2, a6 +; RV32ZBB-NEXT: xor a4, a7, a4 +; RV32ZBB-NEXT: or a3, a4, a3 +; RV32ZBB-NEXT: beqz a3, .LBB27_10 +; RV32ZBB-NEXT: # %bb.9: +; RV32ZBB-NEXT: mv a1, t1 +; RV32ZBB-NEXT: .LBB27_10: +; RV32ZBB-NEXT: neg a6, a1 +; RV32ZBB-NEXT: xor a3, a7, a6 +; RV32ZBB-NEXT: sltu a4, a3, a6 +; RV32ZBB-NEXT: xor a2, a2, a6 +; RV32ZBB-NEXT: add a2, a2, a1 +; RV32ZBB-NEXT: sub a4, a2, a4 +; RV32ZBB-NEXT: xor a2, a5, a6 +; RV32ZBB-NEXT: sltu a5, a2, a6 +; RV32ZBB-NEXT: xor a7, t0, a6 +; RV32ZBB-NEXT: mv t1, a5 +; RV32ZBB-NEXT: beqz t0, .LBB27_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a7, a6 +; RV32ZBB-NEXT: .LBB27_12: +; RV32ZBB-NEXT: add a3, a3, a1 +; RV32ZBB-NEXT: sltu a6, a3, t1 +; RV32ZBB-NEXT: sub a4, a4, a6 +; RV32ZBB-NEXT: sub a3, a3, t1 +; RV32ZBB-NEXT: add a7, a7, a1 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a5, 4(a0) +; RV32ZBB-NEXT: sw a3, 8(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i128: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sub a3, a1, a3 +; RV64ZBB-NEXT: sub a3, a3, a4 +; RV64ZBB-NEXT: sub a2, a0, 
a2
+; RV64ZBB-NEXT: beq a3, a1, .LBB27_2
+; RV64ZBB-NEXT: # %bb.1:
+; RV64ZBB-NEXT: sltu a0, a1, a3
+; RV64ZBB-NEXT: j .LBB27_3
+; RV64ZBB-NEXT: .LBB27_2:
+; RV64ZBB-NEXT: sltu a0, a0, a2
+; RV64ZBB-NEXT: .LBB27_3:
+; RV64ZBB-NEXT: neg a1, a0
+; RV64ZBB-NEXT: xor a2, a2, a1
+; RV64ZBB-NEXT: sltu a4, a2, a1
+; RV64ZBB-NEXT: xor a1, a3, a1
+; RV64ZBB-NEXT: add a1, a1, a0
+; RV64ZBB-NEXT: sub a1, a1, a4
+; RV64ZBB-NEXT: add a0, a2, a0
+; RV64ZBB-NEXT: ret
+ %cmp = icmp ult i128 %a, %b
+ %ab = select i1 %cmp, i128 %a, i128 %b
+ %ba = select i1 %cmp, i128 %b, i128 %a
+ %sub = sub i128 %ba, %ab
+ ret i128 %sub
+}
+
 declare i8 @llvm.abs.i8(i8, i1)
 declare i16 @llvm.abs.i16(i16, i1)
 declare i32 @llvm.abs.i32(i32, i1)
@@ -1737,4 +2129,3 @@ declare i32 @llvm.umin.i32(i32, i32)
 declare i64 @llvm.umin.i64(i64, i64)
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK: {{.*}}
-; NOZBB: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll b/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll
index 3d367ddc59bca..5d588ad66b9ca 100644
--- a/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll
+++ b/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll
@@ -19,7 +19,7 @@ define signext i32 @sum(ptr %a, i32 signext %n, i1 %prof.min.iters.check,
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vsub.vv v24, v16, v24
+; RV32-NEXT: vsub.vv v16, v16, v24
 ; RV32-NEXT: lui a3, 209715
 ; RV32-NEXT: addi a3, a3, 819
 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT: vmv.v.x v0, a3
 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v24, v0
-; RV32-NEXT: vsrl.vi v24, v24, 2
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: vsrl.vi v16, v16, 2
 ; RV32-NEXT: csrr a3, vlenb
 ; RV32-NEXT: slli a3, a3, 4
 ; RV32-NEXT: add a3, sp, a3
 ; RV32-NEXT: addi a3, a3, 16
 ; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vadd.vv v24, v16, v24
-; RV32-NEXT: vsrl.vi v16, v24, 4
+; RV32-NEXT: vand.vv v16, v16, v0
 ; RV32-NEXT: vadd.vv v16, v24, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
 ; RV32-NEXT: lui a3, 61681
 ; RV32-NEXT: addi a3, a3, -241
 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
@@ -2437,16 +2412,16 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT: addi a0, a0, 16
 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vsub.vv v24, v8, v24
+; RV32-NEXT: vsub.vv v8, v8, v24
 ; RV32-NEXT: csrr a0, vlenb
 ; RV32-NEXT: slli a0, a0, 4
 ; RV32-NEXT: add a0, sp, a0
 ; RV32-NEXT: addi a0, a0, 16
 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v24, v0
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v0
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vadd.vv v8, v24, v8
 ; RV32-NEXT: vsrl.vi v24, v8, 4
 ; RV32-NEXT: vadd.vv v8, v8, v24
 ; RV32-NEXT: csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index 0ef0a431dabc4..d36240e493e41 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -2266,7 +2266,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va,
define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT: addi sp, sp, -16
 ; RV32-NEXT: .cfi_def_cfa_offset 16
 ; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: mul a1, a1, a2
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; RV32-NEXT: csrr a1, vlenb
 ; RV32-NEXT: slli a1, a1, 5
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; RV32-NEXT: csrr a1, vlenb
 ; RV32-NEXT: sub a2, a0, a1
 ; RV32-NEXT: sltu a3, a0, a2
@@ -2624,22 +2615,22 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT: and a3, a3, a2
 ; RV32-NEXT: li a2, 1
 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v16, a2
+; RV32-NEXT: vsub.vx v24, v16, a2
 ; RV32-NEXT: vnot.v v16, v16
-; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: vand.vv v16, v16, v24
 ; RV32-NEXT: vsrl.vi v24, v16, 1
 ; RV32-NEXT: lui a4, 349525
 ; RV32-NEXT: addi a4, a4, 1365
 ; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a4
+; RV32-NEXT: vmv.v.x v0, a4
 ; RV32-NEXT: csrr a4, vlenb
 ; RV32-NEXT: li a5, 24
 ; RV32-NEXT: mul a4, a4, a5
 ; RV32-NEXT: add a4, sp, a4
 ; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v24, v8
+; RV32-NEXT: vand.vv v24, v24, v0
 ; RV32-NEXT: vsub.vv v16, v16, v24
 ; RV32-NEXT: lui a4, 209715
 ; RV32-NEXT: addi a4, a4, 819
@@ -2648,6 +2639,11 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT: vand.vv v24, v16, v0
 ; RV32-NEXT: vsrl.vi v16, v16, 2
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT: vand.vv v16, v16, v0
 ; RV32-NEXT: vadd.vv v16, v24, v16
 ; RV32-NEXT: vsrl.vi v24, v16, 4
@@ -2655,50 +2651,46 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT: lui a4, 61681
 ; RV32-NEXT: addi a4, a4, -241
 ; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a4
+; RV32-NEXT: vmv.v.x v24, a4
 ; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: slli a4, a4, 3
 ; RV32-NEXT: add a4, sp, a4
 ; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vand.vv v16, v16, v24
 ; RV32-NEXT: lui a4, 4112
 ; RV32-NEXT: addi a4, a4, 257
 ; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vmv.v.x v24, a4
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: vmul.vv v16, v16, v24
 ; RV32-NEXT: li a3, 56
-; RV32-NEXT: vsrl.vx v8, v8, a3
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v16, v16, a3
 ; RV32-NEXT: bltu a0, a1, .LBB47_2
 ; RV32-NEXT: # %bb.1:
 ; RV32-NEXT: mv a0, a1
 ; RV32-NEXT: .LBB47_2:
-; RV32-NEXT: slli a1, a1, 5
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v24, a2
-; RV32-NEXT: vnot.v v24, v24
-; RV32-NEXT: vand.vv v8, v24, v8
+; RV32-NEXT: vsub.vx v24, v8, a2
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v24
 ; RV32-NEXT: vsrl.vi v24, v8, 1
 ; RV32-NEXT: csrr a0, vlenb
 ; RV32-NEXT: li a1, 24
 ; RV32-NEXT: mul a0, a0, a1
 ; RV32-NEXT: add a0, sp, a0
 ; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v24, v24, v0
 ; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT: vand.vv v24, v8, v0
 ; RV32-NEXT: vsrl.vi v8, v8, 2
 ; RV32-NEXT: vand.vv v8, v8, v0
@@ -2706,23 +2698,17 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT: vsrl.vi v24, v8, 4
 ; RV32-NEXT: vadd.vv v8, v8, v24
 ; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: csrr a0, vlenb
 ; RV32-NEXT: slli a0, a0, 3
 ; RV32-NEXT: add a0, sp, a0
 ; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v8, v16
-; RV32-NEXT: vsrl.vx v8, v8, a3
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v24
 ; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vmul.vv v8, v8, v24
+; RV32-NEXT: vsrl.vx v8, v8, a3
 ; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
 ; RV32-NEXT: add sp, sp, a0
 ; RV32-NEXT: addi sp, sp, 16
 ; RV32-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index bc3e135a588a6..eff56e408d6d5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -159,296 +159,308 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT: addi sp, sp, -16
 ; RV32-NEXT: .cfi_def_cfa_offset 16
 ; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 80
+; RV32-NEXT: li a3, 84
 ; RV32-NEXT: mul a2, a2, a3
 ; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 80 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 84 * vlenb
 ; RV32-NEXT: addi a3, a1, 256
 ; RV32-NEXT: li a2, 32
 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, 
ma -; RV32-NEXT: vle32.v v16, (a3) +; RV32-NEXT: vle32.v v8, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 6 +; RV32-NEXT: li a4, 76 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a1, 128 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vslideup.vi v8, v16, 4 +; RV32-NEXT: vslideup.vi v4, v8, 4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 40 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 12 ; RV32-NEXT: vmv.s.x v0, a4 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 +; RV32-NEXT: li a5, 24 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v3, v0 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v8, v16, 10, v0.t +; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 44 +; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vslideup.vi v4, v8, 10, v0.t ; RV32-NEXT: lui a4, %hi(.LCPI6_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v8, (a4) -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v0, (a4) ; RV32-NEXT: lui a4, %hi(.LCPI6_1) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_1) ; RV32-NEXT: lui a5, 1 ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a6, 24 +; RV32-NEXT: li a6, 56 ; RV32-NEXT: mul a4, a4, a6 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 72 +; RV32-NEXT: li a4, 68 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, a5, -64 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vmv.s.x v16, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v8, v4 +; RV32-NEXT: vs1r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: 
vrgatherei16.vv v16, v8, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v8, v16 +; RV32-NEXT: vmv.v.v v4, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v12, v8, 2 -; RV32-NEXT: vmv1r.v v8, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v3, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vslideup.vi v12, v16, 8, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v0, (a1) -; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 28 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v10, (a1) +; RV32-NEXT: vle16.v v2, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v24, v16, v0 +; RV32-NEXT: vrgatherei16.vv v24, v16, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add 
a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v4, v0.t +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 28 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v8, v4, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v12, v24 +; RV32-NEXT: vmv.v.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v24, v10 -; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vrgatherei16.vv v8, v24, v2 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v24, 6, v0.t +; RV32-NEXT: vslideup.vi v8, v24, 6, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a3, 44 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v12, (a1) -; RV32-NEXT: vle16.v v8, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v0, v12 -; RV32-NEXT: vmv1r.v v3, v8 -; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v8, v16, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul 
a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) ; RV32-NEXT: lui a3, %hi(.LCPI6_8) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_8) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI6_9) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_9) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v4, (a3) -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 2 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs4r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v24, v8 +; RV32-NEXT: vrgatherei16.vv v12, v8, v16 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v24, v16 ; RV32-NEXT: vslideup.vi v12, v16, 4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v16, v4 -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vrgatherei16.vv v8, v0, v20 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 
@@ -461,48 +473,51 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v3, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v8, v16, 6 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v8, v24, 6 ; RV32-NEXT: vmv1r.v v0, v3 -; RV32-NEXT: vrgatherei16.vv v8, v24, v12, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v12, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv4r.v v24, v16 ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v28, (a1) ; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: li a1, 1008 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v24 +; RV32-NEXT: vrgatherei16.vv v8, v16, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill @@ -511,14 +526,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a3, %hi(.LCPI6_14) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_14) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v20, (a1) +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI6_15) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_15) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v24, (a3) -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v28, (a3) +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 40 @@ -526,21 +541,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v16, v8, v20, v0.t +; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; 
RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a3, 44 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -548,20 +558,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v8, v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 60 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -570,56 +580,57 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v28, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 12 +; RV32-NEXT: li a2, 76 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv.v.v v28, v0 +; RV32-NEXT: vmv.v.v v24, v0 ; RV32-NEXT: vmv.v.v v16, v8 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vse32.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v28, (a1) -; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: vse32.v v24, (a1) +; RV32-NEXT: addi a1, a0, 192 +; RV32-NEXT: vse32.v v28, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 44 +; RV32-NEXT: li a2, 36 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded 
Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 80 +; RV32-NEXT: li a1, 84 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 @@ -630,15 +641,15 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 74 +; RV64-NEXT: li a3, 66 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xca, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 74 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 66 * vlenb ; RV64-NEXT: addi a2, a1, 256 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 25 +; RV64-NEXT: li a3, 21 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 @@ -646,76 +657,85 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a2, a1, 128 ; RV64-NEXT: vle64.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a3, a1, 6 -; RV64-NEXT: add a1, a3, a1 +; RV64-NEXT: li a3, 57 +; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vrgather.vi v12, v16, 4 ; RV64-NEXT: li a1, 128 -; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma ; RV64-NEXT: vslidedown.vi v16, v16, 8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 49 +; RV64-NEXT: li a3, 37 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vrgather.vi v12, v16, 2, v0.t ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vid.v v10 ; RV64-NEXT: li a1, 6 -; RV64-NEXT: vmul.vx v2, v10, a1 +; RV64-NEXT: vmul.vx v8, v10, a1 ; RV64-NEXT: li a1, 56 -; RV64-NEXT: vle64.v v16, (a2) +; RV64-NEXT: vle64.v v24, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 57 +; RV64-NEXT: li a3, 45 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v7, a1 -; RV64-NEXT: vadd.vi v10, v2, -16 +; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v10, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v24, v2 -; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vs1r.v v10, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v10, v8, -16 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 57 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; 
RV64-NEXT: vrgatherei16.vv v16, v0, v8 +; RV64-NEXT: vmv2r.v v4, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v6, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv1r.v v0, v6 ; RV64-NEXT: vrgatherei16.vv v16, v24, v10, v0.t ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v12, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 +; RV64-NEXT: li a2, 21 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v12, v16, 5 -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vmv1r.v v6, v8 +; RV64-NEXT: vrgather.vi v12, v8, 5 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -723,19 +743,19 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgather.vi v12, v16, 3, v0.t ; RV64-NEXT: vmv.v.v v28, v12 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v24, v2, 1 -; RV64-NEXT: vadd.vi v26, v2, -15 +; RV64-NEXT: vadd.vi v24, v4, 1 +; RV64-NEXT: vadd.vi v26, v4, -15 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; RV64-NEXT: vrgatherei16.vv v16, v8, v24 -; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vmv1r.v v0, v6 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -744,8 +764,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v28, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 4 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 13 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill @@ -755,7 +775,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vmv.v.i v9, 6 ; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 +; RV64-NEXT: li a2, 21 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -763,259 +783,253 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vrgatherei16.vv v12, v16, v9 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vrgatherei16.vv v12, v16, v10 ; 
RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vmv4r.v v8, v16 ; RV64-NEXT: vrgather.vi v12, v16, 2 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vrgather.vi v12, v16, 3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: li a1, 24 -; RV64-NEXT: vmv.s.x v1, a1 -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v24, v2, 2 -; RV64-NEXT: vadd.vi v4, v2, -14 +; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v16, v24 -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v16, v4, 2 +; RV64-NEXT: vadd.vi v2, v4, -14 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 57 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v8, v24, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v16, v2, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v20, v16, 4, v0.t +; RV64-NEXT: vrgather.vi v28, v24, 4, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv2r.v v8, v4 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v4, v2, 3 -; RV64-NEXT: vadd.vi v8, v2, -13 -; RV64-NEXT: 
csrr a1, vlenb -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v4, v4, 3 +; RV64-NEXT: vadd.vi v6, v8, -13 +; RV64-NEXT: vmv2r.v v2, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v16, v4 -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vrgatherei16.vv v8, v24, v4 ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v16, v6, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v8, v24, 5, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vrgather.vi v4, v16, 5, v0.t ; RV64-NEXT: lui a1, 96 ; RV64-NEXT: li a2, 192 -; RV64-NEXT: vmv.s.x v28, a2 +; RV64-NEXT: vmv.s.x v1, a2 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: vmv1r.v v0, v28 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v12, v24, v8, v0.t +; RV64-NEXT: vrgatherei16.vv v12, v16, v8, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: li a1, 28 ; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v30, v2, 4 -; RV64-NEXT: vadd.vi v6, v2, -12 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: slli a2, a1, 3 ; RV64-NEXT: add a1, 
a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v8, v30 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v22, v2, 4 +; RV64-NEXT: vadd.vi v20, v2, -12 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 57 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v16, v8, v6, v0.t +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v8, v24, v22 ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 45 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v24, v20, v0.t ; RV64-NEXT: lui a1, 112 ; RV64-NEXT: addi a1, a1, 1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.x v12, a1 -; RV64-NEXT: vmv1r.v v0, v28 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v24, v12, v0.t +; RV64-NEXT: vrgatherei16.vv v20, v16, v12, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 -; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v16, v24 -; RV64-NEXT: vmv2r.v v8, v2 +; RV64-NEXT: vmv.v.v v12, v24 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vadd.vi v12, v2, 5 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v0, v12 +; RV64-NEXT: vrgatherei16.vv v24, v16, v12 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v2, v8, -11 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl1r.v v0, (a1) # 
Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v12, v2, -11 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v24, v8, v2, v0.t +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; RV64-NEXT: vmv4r.v v12, v4 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v12, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v20, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv.v.v v20, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload @@ -1028,24 +1042,30 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a1, a0, 192 ; RV64-NEXT: vse64.v v12, (a1) ; RV64-NEXT: addi a1, a0, 128 -; RV64-NEXT: vse64.v v16, (a1) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 53 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 64 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 4 -; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: li a3, 13 +; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 74 +; RV64-NEXT: li a1, 66 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll index 6e5ab436fc02d..a8798474d669a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll @@ -121,10 +121,9 @@ define i32 @reduce_sum_16xi32_prefix2(ptr %p) { define 
i32 @reduce_sum_16xi32_prefix3(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix3: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -160,10 +159,9 @@ define i32 @reduce_sum_16xi32_prefix4(ptr %p) { define i32 @reduce_sum_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -183,10 +181,9 @@ define i32 @reduce_sum_16xi32_prefix5(ptr %p) { define i32 @reduce_sum_16xi32_prefix6(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix6: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -208,10 +205,9 @@ define i32 @reduce_sum_16xi32_prefix6(ptr %p) { define i32 @reduce_sum_16xi32_prefix7(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -263,10 +259,9 @@ define i32 @reduce_sum_16xi32_prefix8(ptr %p) { define i32 @reduce_sum_16xi32_prefix9(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix9: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 9, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetivli zero, 9, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -294,10 +289,9 @@ define i32 @reduce_sum_16xi32_prefix9(ptr %p) { define i32 @reduce_sum_16xi32_prefix13(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix13: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 13, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetivli zero, 13, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -334,10 +328,9 @@ define i32 @reduce_sum_16xi32_prefix13(ptr %p) { define i32 @reduce_sum_16xi32_prefix14(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix14: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 14, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetivli zero, 14, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -375,10 +368,9 @@ define i32 @reduce_sum_16xi32_prefix14(ptr %p) { define i32 @reduce_sum_16xi32_prefix15(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix15: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 15, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; 
CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetivli zero, 15, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -499,10 +491,9 @@ define i32 @reduce_xor_16xi32_prefix2(ptr %p) { define i32 @reduce_xor_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_xor_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vredxor.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -537,7 +528,7 @@ define i32 @reduce_and_16xi32_prefix2(ptr %p) { define i32 @reduce_and_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_and_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v10, -1 @@ -576,10 +567,9 @@ define i32 @reduce_or_16xi32_prefix2(ptr %p) { define i32 @reduce_or_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_or_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vredor.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -619,11 +609,10 @@ define i32 @reduce_smax_16xi32_prefix2(ptr %p) { define i32 @reduce_smax_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_smax_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 524288 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vredmax.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -658,12 +647,11 @@ define i32 @reduce_smin_16xi32_prefix2(ptr %p) { define i32 @reduce_smin_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_smin_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 524288 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vredmin.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -698,10 +686,9 @@ define i32 @reduce_umax_16xi32_prefix2(ptr %p) { define i32 @reduce_umax_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_umax_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vredmaxu.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -736,7 +723,7 @@ define i32 @reduce_umin_16xi32_prefix2(ptr %p) { define i32 @reduce_umin_16xi32_prefix5(ptr %p) { ; RV32-LABEL: reduce_umin_16xi32_prefix5: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vsetivli zero, 5, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v10, -1 @@ -747,11 +734,10 @@ define i32 @reduce_umin_16xi32_prefix5(ptr %p) { ; ; RV64-LABEL: reduce_umin_16xi32_prefix5: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma 
+; RV64-NEXT: vsetivli zero, 5, e32, m2, ta, ma
 ; RV64-NEXT: vle32.v v8, (a0)
 ; RV64-NEXT: li a0, -1
 ; RV64-NEXT: vmv.s.x v10, a0
-; RV64-NEXT: vsetivli zero, 5, e32, m2, ta, ma
 ; RV64-NEXT: vredminu.vs v8, v8, v10
 ; RV64-NEXT: vmv.x.s a0, v8
 ; RV64-NEXT: ret
@@ -787,11 +773,10 @@ define float @reduce_fadd_16xf32_prefix2(ptr %p) {
 define float @reduce_fadd_16xi32_prefix5(ptr %p) {
 ; CHECK-LABEL: reduce_fadd_16xi32_prefix5:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
 ; CHECK-NEXT: vle32.v v8, (a0)
 ; CHECK-NEXT: lui a0, 524288
 ; CHECK-NEXT: vmv.s.x v10, a0
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
 ; CHECK-NEXT: vfredusum.vs v8, v8, v10
 ; CHECK-NEXT: vfmv.f.s fa0, v8
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll
index 2b12249378eb1..4f58ccb5188d3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/remat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll
@@ -171,3 +171,339 @@ define void @vmv.v.i(ptr %p) {
   store volatile <vscale x 8 x i64> %vmv.v.i, ptr %p
   ret void
 }
+
+; The live range of %x needs to be extended down to the use of vmv.v.x at the
+; end of the block.
+define void @vmv.v.x_needs_extended(ptr %p, i64 %x) {
+; POSTRA-LABEL: vmv.v.x_needs_extended:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: addi sp, sp, -16
+; POSTRA-NEXT: .cfi_def_cfa_offset 16
+; POSTRA-NEXT: csrr a2, vlenb
+; POSTRA-NEXT: slli a2, a2, 3
+; POSTRA-NEXT: sub sp, sp, a2
+; POSTRA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; POSTRA-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; POSTRA-NEXT: vmv.v.x v8, a1
+; POSTRA-NEXT: addi a1, sp, 16
+; POSTRA-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vl8re64.v v16, (a0)
+; POSTRA-NEXT: vl8re64.v v24, (a0)
+; POSTRA-NEXT: vl8re64.v v0, (a0)
+; POSTRA-NEXT: vl8re64.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v0, (a0)
+; POSTRA-NEXT: vs8r.v v24, (a0)
+; POSTRA-NEXT: vs8r.v v16, (a0)
+; POSTRA-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: csrr a0, vlenb
+; POSTRA-NEXT: slli a0, a0, 3
+; POSTRA-NEXT: add sp, sp, a0
+; POSTRA-NEXT: addi sp, sp, 16
+; POSTRA-NEXT: ret
+;
+; PRERA-LABEL: vmv.v.x_needs_extended:
+; PRERA: # %bb.0:
+; PRERA-NEXT: addi sp, sp, -16
+; PRERA-NEXT: .cfi_def_cfa_offset 16
+; PRERA-NEXT: csrr a2, vlenb
+; PRERA-NEXT: slli a2, a2, 3
+; PRERA-NEXT: sub sp, sp, a2
+; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; PRERA-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; PRERA-NEXT: vmv.v.x v8, a1
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: addi a1, sp, 16
+; PRERA-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; PRERA-NEXT: vl8re64.v v24, (a0)
+; PRERA-NEXT: vl8re64.v v0, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v0, (a0)
+; PRERA-NEXT: vs8r.v v24, (a0)
+; PRERA-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: csrr a0, vlenb
+; PRERA-NEXT: slli a0, a0, 3
+; PRERA-NEXT: add sp, sp, a0
+; PRERA-NEXT: addi sp, sp, 16
+; PRERA-NEXT: ret
+  %vmv.v.x = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(<vscale x 8 x i64> poison, i64 %x, i64 -1)
+  store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
+
+  %a = load volatile <vscale x 8 x i64>, ptr %p
+  %b = load volatile <vscale x 8 x i64>, ptr %p
+  %c = load volatile <vscale x 8 x i64>, ptr %p
volatile , ptr %p + %d = load volatile , ptr %p + store volatile %d, ptr %p + store volatile %c, ptr %p + store volatile %b, ptr %p + store volatile %a, ptr %p + + store volatile %vmv.v.x, ptr %p + ret void +} + +define void @vmv.v.x_live(ptr %p, i64 %x) { +; POSTRA-LABEL: vmv.v.x_live: +; POSTRA: # %bb.0: +; POSTRA-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; POSTRA-NEXT: vmv.v.x v8, a1 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vl8re64.v v16, (a0) +; POSTRA-NEXT: vl8re64.v v24, (a0) +; POSTRA-NEXT: vl8re64.v v0, (a0) +; POSTRA-NEXT: vl8re64.v v8, (a0) +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vs8r.v v0, (a0) +; POSTRA-NEXT: vs8r.v v24, (a0) +; POSTRA-NEXT: vs8r.v v16, (a0) +; POSTRA-NEXT: vmv.v.x v8, a1 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: sd a1, 0(a0) +; POSTRA-NEXT: ret +; +; PRERA-LABEL: vmv.v.x_live: +; PRERA: # %bb.0: +; PRERA-NEXT: addi sp, sp, -16 +; PRERA-NEXT: .cfi_def_cfa_offset 16 +; PRERA-NEXT: csrr a2, vlenb +; PRERA-NEXT: slli a2, a2, 3 +; PRERA-NEXT: sub sp, sp, a2 +; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; PRERA-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; PRERA-NEXT: vmv.v.x v8, a1 +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: addi a2, sp, 16 +; PRERA-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; PRERA-NEXT: vl8re64.v v24, (a0) +; PRERA-NEXT: vl8re64.v v0, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v0, (a0) +; PRERA-NEXT: vs8r.v v24, (a0) +; PRERA-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: sd a1, 0(a0) +; PRERA-NEXT: csrr a0, vlenb +; PRERA-NEXT: slli a0, a0, 3 +; PRERA-NEXT: add sp, sp, a0 +; PRERA-NEXT: addi sp, sp, 16 +; PRERA-NEXT: ret + %vmv.v.x = call @llvm.riscv.vmv.v.x.nxv8i64( poison, i64 %x, i64 -1) + store volatile %vmv.v.x, ptr %p + + %a = load volatile , ptr %p + %b = load volatile , ptr %p + %c = load volatile , ptr %p + %d = load volatile , ptr %p + store volatile %d, ptr %p + store volatile %c, ptr %p + store volatile %b, ptr %p + store volatile %a, ptr %p + + store volatile %vmv.v.x, ptr %p + store volatile i64 %x, ptr %p + ret void +} + +define void @vfmv.v.f(ptr %p, double %x) { +; POSTRA-LABEL: vfmv.v.f: +; POSTRA: # %bb.0: +; POSTRA-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; POSTRA-NEXT: vfmv.v.f v8, fa0 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vl8re64.v v16, (a0) +; POSTRA-NEXT: vl8re64.v v24, (a0) +; POSTRA-NEXT: vl8re64.v v0, (a0) +; POSTRA-NEXT: vl8re64.v v8, (a0) +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vs8r.v v0, (a0) +; POSTRA-NEXT: vs8r.v v24, (a0) +; POSTRA-NEXT: vs8r.v v16, (a0) +; POSTRA-NEXT: vfmv.v.f v8, fa0 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: fsd fa0, 0(a0) +; POSTRA-NEXT: ret +; +; PRERA-LABEL: vfmv.v.f: +; PRERA: # %bb.0: +; PRERA-NEXT: addi sp, sp, -16 +; PRERA-NEXT: .cfi_def_cfa_offset 16 +; PRERA-NEXT: csrr a1, vlenb +; PRERA-NEXT: slli a1, a1, 3 +; PRERA-NEXT: sub sp, sp, a1 +; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; PRERA-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; PRERA-NEXT: vfmv.v.f v8, fa0 +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: addi a1, sp, 16 +; PRERA-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; PRERA-NEXT: vl8re64.v v24, (a0) +; PRERA-NEXT: vl8re64.v v0, (a0) +; 
PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v0, (a0) +; PRERA-NEXT: vs8r.v v24, (a0) +; PRERA-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: fsd fa0, 0(a0) +; PRERA-NEXT: csrr a0, vlenb +; PRERA-NEXT: slli a0, a0, 3 +; PRERA-NEXT: add sp, sp, a0 +; PRERA-NEXT: addi sp, sp, 16 +; PRERA-NEXT: ret + %vfmv.v.f = call @llvm.riscv.vfmv.v.f.nxv8f64( poison, double %x, i64 -1) + store volatile %vfmv.v.f, ptr %p + + %a = load volatile , ptr %p + %b = load volatile , ptr %p + %c = load volatile , ptr %p + %d = load volatile , ptr %p + store volatile %d, ptr %p + store volatile %c, ptr %p + store volatile %b, ptr %p + store volatile %a, ptr %p + + store volatile %vfmv.v.f, ptr %p + store volatile double %x, ptr %p + ret void +} + +define void @vmv.s.x(ptr %p, i64 %x) { +; POSTRA-LABEL: vmv.s.x: +; POSTRA: # %bb.0: +; POSTRA-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; POSTRA-NEXT: vmv.s.x v8, a1 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vl8re64.v v16, (a0) +; POSTRA-NEXT: vl8re64.v v24, (a0) +; POSTRA-NEXT: vl8re64.v v0, (a0) +; POSTRA-NEXT: vl8re64.v v8, (a0) +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vs8r.v v0, (a0) +; POSTRA-NEXT: vs8r.v v24, (a0) +; POSTRA-NEXT: vs8r.v v16, (a0) +; POSTRA-NEXT: vmv.s.x v8, a1 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: sd a1, 0(a0) +; POSTRA-NEXT: ret +; +; PRERA-LABEL: vmv.s.x: +; PRERA: # %bb.0: +; PRERA-NEXT: addi sp, sp, -16 +; PRERA-NEXT: .cfi_def_cfa_offset 16 +; PRERA-NEXT: csrr a2, vlenb +; PRERA-NEXT: slli a2, a2, 3 +; PRERA-NEXT: sub sp, sp, a2 +; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; PRERA-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; PRERA-NEXT: vmv.s.x v8, a1 +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: addi a2, sp, 16 +; PRERA-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; PRERA-NEXT: vl8re64.v v24, (a0) +; PRERA-NEXT: vl8re64.v v0, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v0, (a0) +; PRERA-NEXT: vs8r.v v24, (a0) +; PRERA-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: sd a1, 0(a0) +; PRERA-NEXT: csrr a0, vlenb +; PRERA-NEXT: slli a0, a0, 3 +; PRERA-NEXT: add sp, sp, a0 +; PRERA-NEXT: addi sp, sp, 16 +; PRERA-NEXT: ret + %vmv.s.x = call @llvm.riscv.vmv.s.x.nxv8i64( poison, i64 %x, i64 -1) + store volatile %vmv.s.x, ptr %p + + %a = load volatile , ptr %p + %b = load volatile , ptr %p + %c = load volatile , ptr %p + %d = load volatile , ptr %p + store volatile %d, ptr %p + store volatile %c, ptr %p + store volatile %b, ptr %p + store volatile %a, ptr %p + + store volatile %vmv.s.x, ptr %p + store volatile i64 %x, ptr %p + ret void +} + +define void @vfmv.s.f(ptr %p, double %x) { +; POSTRA-LABEL: vfmv.s.f: +; POSTRA: # %bb.0: +; POSTRA-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; POSTRA-NEXT: vfmv.s.f v8, fa0 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vl8re64.v v16, (a0) +; POSTRA-NEXT: vl8re64.v v24, (a0) +; POSTRA-NEXT: vl8re64.v v0, (a0) +; POSTRA-NEXT: vl8re64.v v8, (a0) +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vs8r.v v0, (a0) +; POSTRA-NEXT: vs8r.v v24, (a0) +; POSTRA-NEXT: vs8r.v v16, (a0) +; POSTRA-NEXT: vfmv.s.f v8, fa0 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: fsd fa0, 0(a0) +; POSTRA-NEXT: ret +; +; PRERA-LABEL: vfmv.s.f: 
+; PRERA: # %bb.0: +; PRERA-NEXT: addi sp, sp, -16 +; PRERA-NEXT: .cfi_def_cfa_offset 16 +; PRERA-NEXT: csrr a1, vlenb +; PRERA-NEXT: slli a1, a1, 3 +; PRERA-NEXT: sub sp, sp, a1 +; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; PRERA-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; PRERA-NEXT: vfmv.s.f v8, fa0 +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: addi a1, sp, 16 +; PRERA-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; PRERA-NEXT: vl8re64.v v24, (a0) +; PRERA-NEXT: vl8re64.v v0, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v0, (a0) +; PRERA-NEXT: vs8r.v v24, (a0) +; PRERA-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: fsd fa0, 0(a0) +; PRERA-NEXT: csrr a0, vlenb +; PRERA-NEXT: slli a0, a0, 3 +; PRERA-NEXT: add sp, sp, a0 +; PRERA-NEXT: addi sp, sp, 16 +; PRERA-NEXT: ret + %vfmv.s.f = call @llvm.riscv.vfmv.s.f.nxv8f64( poison, double %x, i64 -1) + store volatile %vfmv.s.f, ptr %p + + %a = load volatile , ptr %p + %b = load volatile , ptr %p + %c = load volatile , ptr %p + %d = load volatile , ptr %p + store volatile %d, ptr %p + store volatile %c, ptr %p + store volatile %b, ptr %p + store volatile %a, ptr %p + + store volatile %vfmv.s.f, ptr %p + store volatile double %x, ptr %p + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir index be6ed4d2a6aa1..ed274cf49fa9b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir +++ b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir @@ -14,9 +14,9 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_1 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -52,7 +52,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_0 @@ -92,7 +92,7 @@ body: | ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF ; CHECK-NEXT: 
[[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_3 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -130,7 +130,7 @@ body: | ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_2 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -166,7 +166,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -239,11 +239,11 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -279,9 +279,9 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; 
CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_0 @@ -319,11 +319,11 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_2 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_3 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -359,11 +359,11 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_3 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_2 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -401,9 +401,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], 
%subreg.sub_vrm1_5 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -441,9 +441,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_4 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -481,9 +481,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_7 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -521,9 +521,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_6 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -559,9 +559,9 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = 
INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -597,7 +597,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 @@ -637,7 +637,7 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -675,7 +675,7 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -711,7 +711,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M4_]], %subreg.sub_vrm4_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll index 7f2e3cdbfd0e3..7d78fa5a8f3ef 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll @@ -22,10 +22,10 @@ define half @vpreduce_fadd_nxv1f16(half %s, %v, %v, %v, %v, %v, %v, %v, %v, %v, 
%v, %v, %v, (i32, exnref) +; CHECK: try_table (catch_ref __cpp_exception 0) +; CHECK: call foo +; CHECK: br 2 +; CHECK: end_try_table +; CHECK: end_block +; CHECK: local.set 2 +; CHECK: local.get 0 +; CHECK: global.set __stack_pointer +; CHECK: i32.store __wasm_lpad_context +; CHECK: call _Unwind_CallPersonality +; CHECK: block +; CHECK: br_if 0 +; CHECK: call __cxa_begin_catch +; CHECK: call __cxa_end_catch +; CHECK: br 1 +; CHECK: end_block +; CHECK: local.get 2 +; CHECK: throw_ref +; CHECK: end_block +define void @catch() personality ptr @__gxx_wasm_personality_v0 { +entry: + invoke void @foo() + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch.start] unwind to caller + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [ptr @_ZTIi] + %2 = call ptr @llvm.wasm.get.exception(token %1) + %3 = call i32 @llvm.wasm.get.ehselector(token %1) + %4 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi) + %matches = icmp eq i32 %3, %4 + br i1 %matches, label %catch, label %rethrow + +catch: ; preds = %catch.start + %5 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ] + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %try.cont + +rethrow: ; preds = %catch.start + call void @llvm.wasm.rethrow() [ "funclet"(token %1) ] + unreachable + +try.cont: ; preds = %catch, %entry + ret void +} + +; Destructor (cleanup) test +; +; void foo(); +; struct Temp { +; ~Temp() {} +; }; +; void cleanup() { +; Temp t; +; foo(); +; } + +; CHECK-LABEL: cleanup: +; CHECK: block +; CHECK: block exnref +; CHECK: try_table (catch_all_ref 0) +; CHECK: call foo +; CHECK: br 2 +; CHECK: end_try_table +; CHECK: end_block +; CHECK: local.set 1 +; CHECK: global.set __stack_pointer +; CHECK: call _ZN4TempD2Ev +; CHECK: local.get 1 +; CHECK: throw_ref +; CHECK: end_block +; CHECK: call _ZN4TempD2Ev +define void @cleanup() personality ptr @__gxx_wasm_personality_v0 { +entry: + %t = alloca %struct.Temp, align 1 + invoke void @foo() + to label %invoke.cont unwind label %ehcleanup + +invoke.cont: ; preds = %entry + %call = call ptr @_ZN4TempD2Ev(ptr %t) + ret void + +ehcleanup: ; preds = %entry + %0 = cleanuppad within none [] + %call1 = call ptr @_ZN4TempD2Ev(ptr %t) [ "funclet"(token %0) ] + cleanupret from %0 unwind to caller +} + +; Calling a function that may throw within a 'catch (...)' generates a +; terminatepad, because __cxa_end_catch() also can throw within 'catch (...)'. +; +; void foo(); +; void terminatepad() { +; try { +; foo(); +; } catch (...) 
{ +; foo(); +; } +; } + +; CHECK-LABEL: terminatepad +; CHECK: block +; CHECK: block i32 +; CHECK: try_table (catch __cpp_exception 0) +; CHECK: call foo +; CHECK: br 2 +; CHECK: end_try_table +; CHECK: end_block +; CHECK: call __cxa_begin_catch +; CHECK: block +; CHECK: block exnref +; CHECK: try_table (catch_all_ref 0) +; CHECK: call foo +; CHECK: br 2 +; CHECK: end_try_table +; CHECK: end_block +; CHECK: local.set 2 +; CHECK: block +; CHECK: block +; CHECK: try_table (catch_all 0) +; CHECK: call __cxa_end_catch +; CHECK: br 2 +; CHECK: end_try_table +; CHECK: end_block +; CHECK: call _ZSt9terminatev +; CHECK: unreachable +; CHECK: end_block +; CHECK: local.get 2 +; CHECK: throw_ref +; CHECK: end_block +; CHECK: call __cxa_end_catch +; CHECK: end_block +define void @terminatepad() personality ptr @__gxx_wasm_personality_v0 { +entry: + invoke void @foo() + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch.start] unwind to caller + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [ptr null] + %2 = call ptr @llvm.wasm.get.exception(token %1) + %3 = call i32 @llvm.wasm.get.ehselector(token %1) + %4 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ] + invoke void @foo() [ "funclet"(token %1) ] + to label %invoke.cont1 unwind label %ehcleanup + +invoke.cont1: ; preds = %catch.start + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %try.cont + +try.cont: ; preds = %invoke.cont1, %entry + ret void + +ehcleanup: ; preds = %catch.start + %5 = cleanuppad within %1 [] + invoke void @__cxa_end_catch() [ "funclet"(token %5) ] + to label %invoke.cont2 unwind label %terminate + +invoke.cont2: ; preds = %ehcleanup + cleanupret from %5 unwind to caller + +terminate: ; preds = %ehcleanup + %6 = cleanuppad within %5 [] + call void @_ZSt9terminatev() [ "funclet"(token %6) ] + unreachable +} + +; Tests that prologues and epilogues are not generated within EH scopes. +; They should not be treated as funclets; BBs starting with a catch instruction +; should not have a prologue, and BBs ending with a catchret/cleanupret should +; not have an epilogue. This is separate from the __stack_pointer restoring +; instructions after a catch instruction.
+; +; void bar(int) noexcept; +; void no_prolog_epilog_in_ehpad() { +; int stack_var = 0; +; bar(stack_var); +; try { +; foo(); +; } catch (int) { +; foo(); +; } +; } + +; CHECK-LABEL: no_prolog_epilog_in_ehpad +; CHECK: call bar +; CHECK: block +; CHECK: block () -> (i32, exnref) +; CHECK: try_table (catch_ref __cpp_exception 0) +; CHECK: call foo +; CHECK: br 2 +; CHECK: end_try_table +; CHECK: end_block +; CHECK: local.set 2 +; CHECK-NOT: global.get __stack_pointer +; CHECK: global.set __stack_pointer +; CHECK: block +; CHECK: block +; CHECK: br_if 0 +; CHECK: call __cxa_begin_catch +; CHECK: block exnref +; CHECK: try_table (catch_all_ref 0) +; CHECK: call foo +; CHECK: br 3 +; CHECK: end_try_table +; CHECK: end_block +; CHECK: local.set 2 +; CHECK-NOT: global.get __stack_pointer +; CHECK: global.set __stack_pointer +; CHECK: call __cxa_end_catch +; CHECK: local.get 2 +; CHECK: throw_ref +; CHECK-NOT: global.set __stack_pointer +; CHECK: end_block +; CHECK: local.get 2 +; CHECK: throw_ref +; CHECK: end_block +; CHECK-NOT: global.set __stack_pointer +; CHECK: call __cxa_end_catch +; CHECK: end_block +define void @no_prolog_epilog_in_ehpad() personality ptr @__gxx_wasm_personality_v0 { +entry: + %stack_var = alloca i32, align 4 + call void @bar(ptr %stack_var) + invoke void @foo() + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch.start] unwind to caller + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [ptr @_ZTIi] + %2 = call ptr @llvm.wasm.get.exception(token %1) + %3 = call i32 @llvm.wasm.get.ehselector(token %1) + %4 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi) + %matches = icmp eq i32 %3, %4 + br i1 %matches, label %catch, label %rethrow + +catch: ; preds = %catch.start + %5 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ] + %6 = load float, ptr %5, align 4 + invoke void @foo() [ "funclet"(token %1) ] + to label %invoke.cont1 unwind label %ehcleanup + +invoke.cont1: ; preds = %catch + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %try.cont + +rethrow: ; preds = %catch.start + call void @llvm.wasm.rethrow() [ "funclet"(token %1) ] + unreachable + +try.cont: ; preds = %invoke.cont1, %entry + ret void + +ehcleanup: ; preds = %catch + %7 = cleanuppad within %1 [] + call void @__cxa_end_catch() [ "funclet"(token %7) ] + cleanupret from %7 unwind to caller +} + +; When a function does not have stack-allocated objects, it does not need to +; store SP back to the __stack_pointer global in the epilogue. +; +; void foo(); +; void no_sp_writeback() { +; try { +; foo(); +; } catch (...)
{ +; } +; } + +; CHECK-LABEL: no_sp_writeback +; CHECK: block +; CHECK: block i32 +; CHECK: try_table (catch __cpp_exception 0) +; CHECK: call foo +; CHECK: br 2 +; CHECK: end_try_table +; CHECK: end_block +; CHECK: call __cxa_begin_catch +; CHECK: call __cxa_end_catch +; CHECK: end_block +; CHECK-NOT: global.set __stack_pointer +; CHECK: end_function +define void @no_sp_writeback() personality ptr @__gxx_wasm_personality_v0 { +entry: + invoke void @foo() + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch.start] unwind to caller + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [ptr null] + %2 = call ptr @llvm.wasm.get.exception(token %1) + %3 = call i32 @llvm.wasm.get.ehselector(token %1) + %4 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ] + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %try.cont + +try.cont: ; preds = %catch.start, %entry + ret void +} + +; Tests the case when the result of @llvm.wasm.get.exception is not used. This +; test was created to fix a bug in LateEHPrepare and should not crash. +define void @get_exception_wo_use() personality ptr @__gxx_wasm_personality_v0 { +entry: + invoke void @foo() + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch.start] unwind to caller + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [ptr null] + %2 = call ptr @llvm.wasm.get.exception(token %1) + %3 = call i32 @llvm.wasm.get.ehselector(token %1) + catchret from %1 to label %try.cont + +try.cont: ; preds = %catch.start, %entry + ret void +} + +; Tests a case where a cleanup region (cleanuppad ~ cleanupret) contains another +; catchpad +define void @complex_cleanup_region() personality ptr @__gxx_wasm_personality_v0 { +entry: + invoke void @foo() + to label %invoke.cont unwind label %ehcleanup + +invoke.cont: ; preds = %entry + ret void + +ehcleanup: ; preds = %entry + %0 = cleanuppad within none [] + invoke void @foo() [ "funclet"(token %0) ] + to label %ehcleanupret unwind label %catch.dispatch + +catch.dispatch: ; preds = %ehcleanup + %1 = catchswitch within %0 [label %catch.start] unwind label %ehcleanup.1 + +catch.start: ; preds = %catch.dispatch + %2 = catchpad within %1 [ptr null] + %3 = call ptr @llvm.wasm.get.exception(token %2) + %4 = call i32 @llvm.wasm.get.ehselector(token %2) + catchret from %2 to label %ehcleanupret + +ehcleanup.1: ; preds = %catch.dispatch + %5 = cleanuppad within %0 [] + unreachable + +ehcleanupret: ; preds = %catch.start, %ehcleanup + cleanupret from %0 unwind to caller +} + +; Regression test for a bug where 'rethrow' was not treated correctly as a +; terminator in isel.
+define void @rethrow_terminator() personality ptr @__gxx_wasm_personality_v0 { +entry: + invoke void @foo() + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch.start] unwind label %ehcleanup + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [ptr @_ZTIi] + %2 = call ptr @llvm.wasm.get.exception(token %1) + %3 = call i32 @llvm.wasm.get.ehselector(token %1) + %4 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) + %matches = icmp eq i32 %3, %4 + br i1 %matches, label %catch, label %rethrow + +catch: ; preds = %catch.start + %5 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ] + %6 = load i32, ptr %5, align 4 + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %try.cont + +rethrow: ; preds = %catch.start + invoke void @llvm.wasm.rethrow() #1 [ "funclet"(token %1) ] + to label %unreachable unwind label %ehcleanup + +try.cont: ; preds = %entry, %catch + ret void + +ehcleanup: ; preds = %rethrow, %catch.dispatch + ; The 'rethrow' BB is this BB's predecessor, and its + ; 'invoke void @llvm.wasm.rethrow()' is lowered to a 'RETHROW' in Wasm + ; MIR. This 'phi' creates a 'CONST_I32' instruction in the predecessor + ; 'rethrow' BB. If 'RETHROW' is not treated correctly as a terminator, it can + ; create a BB like + ; bb.3.rethrow: + ; RETHROW 0 + ; %0 = CONST_I32 20 + ; BR ... + %tmp = phi i32 [ 10, %catch.dispatch ], [ 20, %rethrow ] + %7 = cleanuppad within none [] + call void @take_i32(i32 %tmp) [ "funclet"(token %7) ] + cleanupret from %7 unwind to caller + +unreachable: ; preds = %rethrow + unreachable +} + + +declare void @foo() +declare void @bar(ptr) +declare void @take_i32(i32) +declare i32 @__gxx_wasm_personality_v0(...)
+; Function Attrs: noreturn +declare void @llvm.wasm.throw(i32, ptr) #1 +; Function Attrs: nounwind +declare ptr @llvm.wasm.get.exception(token) #0 +; Function Attrs: nounwind +declare i32 @llvm.wasm.get.ehselector(token) #0 +; Function Attrs: noreturn +declare void @llvm.wasm.rethrow() #1 +; Function Attrs: nounwind +declare i32 @llvm.eh.typeid.for(ptr) #0 +declare ptr @__cxa_begin_catch(ptr) +declare void @__cxa_end_catch() +declare void @_ZSt9terminatev() +declare ptr @_ZN4TempD2Ev(ptr returned) + +attributes #0 = { nounwind } +attributes #1 = { noreturn } + +; CHECK: __cpp_exception: diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index b9b3436dd1ed9..6e22d855dc831 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -112,8 +112,7 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -144,8 +143,7 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -176,8 +174,7 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -208,8 +205,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -217,8 +213,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %aext = sext i32 %a to i64 %bext = sext i32 %b to i64 @@ -237,8 +232,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_i16: @@ -247,8 +241,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: subl %eax, %ecx ; X64-NEXT: subl %edi, %eax -; X64-NEXT: cmovll %ecx, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovgel %ecx, %eax ; X64-NEXT: retq %aext = sext i32 %a to i64 %bext = sext i16 %b to i64 @@ -267,8 +260,7 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: @@ -276,8 +268,7 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %aext = sext i32 %a to i64 %bext = sext 
i32 %b to i64 @@ -319,8 +310,7 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %aext = sext i64 %a to i128 %bext = sext i64 %b to i128 @@ -362,8 +352,7 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %aext = sext i64 %a to i128 %bext = sext i64 %b to i128 @@ -558,8 +547,7 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -587,8 +575,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_minmax_i32: @@ -596,8 +583,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %min = call i32 @llvm.smin.i32(i32 %a, i32 %b) %max = call i32 @llvm.smax.i32(i32 %a, i32 %b) @@ -641,8 +627,7 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %min = call i64 @llvm.smin.i64(i64 %a, i64 %b) %max = call i64 @llvm.smax.i64(i64 %a, i64 %b) @@ -776,8 +761,7 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -806,8 +790,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i32: @@ -815,8 +798,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %cmp = icmp sge i32 %a, %b %ab = sub i32 %a, %b @@ -853,8 +835,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b @@ -1031,8 +1012,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32: @@ -1040,8 +1020,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: 
movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 false) @@ -1057,8 +1036,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32_undef: @@ -1066,8 +1044,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 true) @@ -1098,8 +1075,7 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 false) @@ -1130,8 +1106,7 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 true) diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index 9c4c059a3b9bf..4c524c28b160a 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -1154,6 +1154,211 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind { ret i32 %abs } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; X86-LABEL: abd_select_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: subb %dl, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i8: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpb %sil, %al +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovll %edi, %ecx +; X64-NEXT: cmovll %esi, %eax +; X64-NEXT: subb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %cmp = icmp slt i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; X86-LABEL: abd_select_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovlel %eax, %edx +; X86-NEXT: cmovlel %ecx, %eax +; X86-NEXT: subl %edx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpw %si, %ax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovlel %edi, %ecx +; X64-NEXT: cmovlel %esi, %eax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %cmp = icmp sle i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select 
i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; X86-LABEL: abd_select_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmovgl %edx, %eax +; X86-NEXT: cmovgl %ecx, %edx +; X86-NEXT: subl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i32: +; X64: # %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: cmovgl %esi, %edi +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq + %cmp = icmp sgt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; X86-LABEL: abd_select_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: cmovgel %ecx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmovgel %ebx, %eax +; X86-NEXT: cmovgel %edi, %ecx +; X86-NEXT: cmovgel %esi, %ebx +; X86-NEXT: subl %ebx, %eax +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i64: +; X64: # %bb.0: +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmovgeq %rdi, %rax +; X64-NEXT: cmovgeq %rsi, %rdi +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq + %cmp = icmp sge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; X86-LABEL: abd_select_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovll %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: cmovll %ebx, %edi +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: cmovll %ecx, %ebx +; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: cmovll %esi, %ebp +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmovll %edx, %eax +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; 
X64-LABEL: abd_select_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: sbbq %rcx, %rdi +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: cmovlq %rsi, %rdi +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: cmovlq %rax, %r8 +; X64-NEXT: cmovlq %rcx, %rsi +; X64-NEXT: cmovlq %rdx, %rax +; X64-NEXT: subq %r8, %rax +; X64-NEXT: sbbq %rdi, %rsi +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: retq + %cmp = icmp slt i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll index 1ded7e79e2510..6bda99c89a37e 100644 --- a/llvm/test/CodeGen/X86/abdu-neg.ll +++ b/llvm/test/CodeGen/X86/abdu-neg.ll @@ -112,8 +112,7 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -144,8 +143,7 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -176,8 +174,7 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -208,8 +205,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -217,8 +213,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %aext = zext i32 %a to i64 %bext = zext i32 %b to i64 @@ -237,8 +232,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_i16: @@ -247,8 +241,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: subl %eax, %ecx ; X64-NEXT: subl %edi, %eax -; X64-NEXT: cmovbl %ecx, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovael %ecx, %eax ; X64-NEXT: retq %aext = zext i32 %a to i64 %bext = zext i16 %b to i64 @@ -267,8 +260,7 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: @@ -276,8 +268,7 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: 
cmovbl %esi, %eax ; X64-NEXT: retq %aext = zext i32 %a to i64 %bext = zext i32 %b to i64 @@ -313,8 +304,7 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %aext = zext i64 %a to i128 %bext = zext i64 %b to i128 @@ -350,8 +340,7 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %aext = zext i64 %a to i128 %bext = zext i64 %b to i128 @@ -540,8 +529,7 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -569,8 +557,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_minmax_i32: @@ -578,8 +565,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %min = call i32 @llvm.umin.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %a, i32 %b) @@ -623,8 +609,7 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %min = call i64 @llvm.umin.i64(i64 %a, i64 %b) %max = call i64 @llvm.umax.i64(i64 %a, i64 %b) @@ -758,8 +743,7 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -788,8 +772,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i32: @@ -797,8 +780,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %cmp = icmp uge i32 %a, %b %ab = sub i32 %a, %b @@ -832,8 +814,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index 335fa8c156f8e..fe9006a8aec23 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -768,6 +768,212 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ret i128 %sel } +; +; 
sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abdu(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; X86-LABEL: abd_select_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: subb %dl, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i8: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpb %sil, %al +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovbl %edi, %ecx +; X64-NEXT: cmovbl %esi, %eax +; X64-NEXT: subb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %cmp = icmp ult i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; X86-LABEL: abd_select_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovbel %eax, %edx +; X86-NEXT: cmovbel %ecx, %eax +; X86-NEXT: subl %edx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpw %si, %ax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovbel %edi, %ecx +; X64-NEXT: cmovbel %esi, %eax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %cmp = icmp ule i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; X86-LABEL: abd_select_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmoval %edx, %eax +; X86-NEXT: cmoval %ecx, %edx +; X86-NEXT: subl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i32: +; X64: # %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: cmoval %esi, %edi +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq + %cmp = icmp ugt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; X86-LABEL: abd_select_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: cmovael %ecx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmovael %ebx, %eax +; X86-NEXT: cmovael %edi, %ecx +; X86-NEXT: cmovael %esi, %ebx +; X86-NEXT: subl %ebx, %eax +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i64: +; X64: # %bb.0: +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmovaeq %rdi, %rax +; X64-NEXT: cmovaeq %rsi, %rdi +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq + %cmp = icmp uge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 
%a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; X86-LABEL: abd_select_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: cmovbl %ebx, %edi +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: cmovbl %ecx, %ebx +; X86-NEXT: cmovbl %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: cmovbl %esi, %ebp +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmovbl %edx, %eax +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; X64-LABEL: abd_select_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: sbbq %rcx, %rdi +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: cmovbq %rsi, %rdi +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: cmovbq %rax, %r8 +; X64-NEXT: cmovbq %rcx, %rsi +; X64-NEXT: cmovbq %rdx, %rax +; X64-NEXT: subq %r8, %rax +; X64-NEXT: sbbq %rdi, %rsi +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: retq + %cmp = icmp ult i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} + declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) diff --git a/llvm/test/Transforms/LoopDeletion/noalias.ll b/llvm/test/Transforms/LoopDeletion/noalias.ll new file mode 100644 index 0000000000000..0f3b71df94270 --- /dev/null +++ b/llvm/test/Transforms/LoopDeletion/noalias.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=loop-deletion -S | FileCheck %s + +define void @pr108052(i64 %n) { +; CHECK-LABEL: define void @pr108052( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK: [[FOR_EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.exit: + ret void + +for.body: + %indvar = phi i64 [ 0, %entry ], [ %inc, %for.body ] + call void @llvm.experimental.noalias.scope.decl(metadata !0) + %inc = add nuw i64 %indvar, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.exit, label %for.body +} + +!0 = !{!1} +!1 = distinct !{!1, !2, !"x: %a"} +!2 = distinct !{!2, !"x"} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 
54c7299f6db0f..6d309c4453c7e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -322,7 +322,87 @@ return:
   ret i32 0
 }
 
+; Test case for https://github.com/llvm/llvm-project/issues/107473.
+define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
+; CHECK-LABEL: define void @test_phi_in_latch_redundant(
+; CHECK-SAME: ptr [[DST:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 37, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 37, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 37, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 9
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 9, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 9, [[TMP5]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <vscale x 2 x i32> [[BROADCAST_SPLAT]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 -1, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 2 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> [[TMP10]], <vscale x 2 x ptr> [[TMP11]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 37, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    br i1 false, label %[[LOOP_LATCH]], label %[[THEN:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[NOT_A:%.*]] = xor i32 [[A]], -1
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+;
CHECK-NEXT: [[P:%.*]] = phi i32 [ [[NOT_A]], %[[THEN]] ], [ 0, %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 [[P]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 9 +; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], 322 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 false, label %loop.latch, label %then + +then: + %not.a = xor i32 %a, -1 + br label %loop.latch + +loop.latch: + %p = phi i32 [ %not.a, %then ], [ 0, %loop.header ] + %gep = getelementptr i32, ptr %dst, i64 %iv + store i32 %p, ptr %gep, align 4 + %iv.next = add i64 %iv, 9 + %ec = icmp slt i64 %iv, 322 + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} @@ -343,4 +423,6 @@ return: ; CHECK: [[META15]] = distinct !{[[META15]], [[META13]]} ; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} ; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]} +; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll index a28b0542a7c59..ba260752ce4b5 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -472,9 +472,7 @@ define void @immut_param_mayalias(ptr align 4 noalias %val) { ; argument doesn't matter. define void @immut_param_unescaped_alloca(ptr align 4 noalias %val) { ; CHECK-LABEL: @immut_param_unescaped_alloca( -; CHECK-NEXT: [[VAL1:%.*]] = alloca i8, align 4 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VAL1]], ptr align 4 [[VAL:%.*]], i64 1, i1 false) -; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL1]]) +; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL:%.*]]) ; CHECK-NEXT: ret void ; %val1 = alloca i8, align 4 @@ -489,8 +487,7 @@ define void @immut_param_memory_argmem_read(ptr align 4 noalias %val) { ; CHECK-LABEL: @immut_param_memory_argmem_read( ; CHECK-NEXT: [[VAL1:%.*]] = alloca i8, align 4 ; CHECK-NEXT: call void @f(ptr [[VAL1]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VAL1]], ptr align 4 [[VAL:%.*]], i64 1, i1 false) -; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL1]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL:%.*]]) #[[ATTR6:[0-9]+]] ; CHECK-NEXT: ret void ; %val1 = alloca i8, align 4 diff --git a/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll b/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll new file mode 100644 index 0000000000000..03aee68fa4248 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=simplifycfg -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local noundef i32 @main() { +; CHECK-LABEL: define dso_local noundef i32 @main() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x ptr], 
align 16 +; CHECK-NEXT: store ptr blockaddress(@main, %[[BB4:.*]]), ptr [[ALLOCA]], align 16, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds [2 x ptr], ptr [[ALLOCA]], i64 0, i64 1 +; CHECK-NEXT: store ptr blockaddress(@main, %[[BB10:.*]]), ptr [[GETELEMENTPTR]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[PHI8:%.*]], %[[BB7:.*]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[PHI9:%.*]], %[[BB7]] ] +; CHECK-NEXT: switch i32 [[PHI]], label %[[BB7]] [ +; CHECK-NEXT: i32 0, label %[[BB12:.*]] +; CHECK-NEXT: i32 1, label %[[BB4]] +; CHECK-NEXT: i32 2, label %[[BB6:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB4]]: +; CHECK-NEXT: [[PHI5:%.*]] = phi i32 [ [[PHI13:%.*]], %[[BB12]] ], [ [[PHI2]], %[[BB1]] ] +; CHECK-NEXT: br label %[[BB7]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo(i32 noundef [[PHI2]]) +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[PHI2]], 1 +; CHECK-NEXT: br label %[[BB12]] +; CHECK: [[BB7]]: +; CHECK-NEXT: [[PHI8]] = phi i32 [ [[PHI]], %[[BB1]] ], [ 2, %[[BB4]] ] +; CHECK-NEXT: [[PHI9]] = phi i32 [ [[PHI2]], %[[BB1]] ], [ [[PHI5]], %[[BB4]] ] +; CHECK-NEXT: br label %[[BB1]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[BB10]]: +; CHECK-NEXT: [[CALL11:%.*]] = call i32 @foo(i32 noundef [[PHI13]]) +; CHECK-NEXT: ret i32 0 +; CHECK: [[BB12]]: +; CHECK-NEXT: [[PHI13]] = phi i32 [ [[ADD]], %[[BB6]] ], [ [[PHI2]], %[[BB1]] ] +; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[PHI13]] to i64 +; CHECK-NEXT: [[GETELEMENTPTR14:%.*]] = getelementptr inbounds [2 x ptr], ptr [[ALLOCA]], i64 0, i64 [[SEXT]] +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[GETELEMENTPTR14]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: indirectbr ptr [[LOAD]], [label %[[BB4]], label %bb10] +; +bb: + %alloca = alloca [2 x ptr], align 16 + store ptr blockaddress(@main, %bb4), ptr %alloca, align 16, !tbaa !0 + %getelementptr = getelementptr inbounds [2 x ptr], ptr %alloca, i64 0, i64 1 + store ptr blockaddress(@main, %bb10), ptr %getelementptr, align 8, !tbaa !0 + br label %bb1 + +bb1: ; preds = %bb7, %bb + %phi = phi i32 [ 0, %bb ], [ %phi8, %bb7 ] + %phi2 = phi i32 [ 0, %bb ], [ %phi9, %bb7 ] + switch i32 %phi, label %bb7 [ + i32 0, label %bb3 + i32 1, label %bb4 + i32 2, label %bb6 + ] + +bb3: ; preds = %bb1 + br label %bb12 + +bb4: ; preds = %bb12, %bb1 + %phi5 = phi i32 [ %phi13, %bb12 ], [ %phi2, %bb1 ] + br label %bb7 + +bb6: ; preds = %bb1 + %call = call i32 @foo(i32 noundef %phi2) + %add = add nsw i32 %phi2, 1 + br label %bb12 + +bb7: ; preds = %bb4, %bb1 + %phi8 = phi i32 [ %phi, %bb1 ], [ 2, %bb4 ] + %phi9 = phi i32 [ %phi2, %bb1 ], [ %phi5, %bb4 ] + br label %bb1, !llvm.loop !4 + +bb10: ; preds = %bb12 + %call11 = call i32 @foo(i32 noundef %phi13) + ret i32 0 + +bb12: ; preds = %bb6, %bb3 + %phi13 = phi i32 [ %add, %bb6 ], [ %phi2, %bb3 ] + %sext = sext i32 %phi13 to i64 + %getelementptr14 = getelementptr inbounds [2 x ptr], ptr %alloca, i64 0, i64 %sext + %load = load ptr, ptr %getelementptr14, align 8, !tbaa !0 + indirectbr ptr %load, [label %bb4, label %bb10] +} + +declare i32 @foo(i32) + +!0 = !{!1, !1, i64 0} +!1 = !{!"any pointer", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C++ TBAA"} +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !2, i64 0} +;. 
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"any pointer", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK: [[LOOP4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"int", [[META2]], i64 0} +;. diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected index 71e82eca6c3e3..936efa378c1a4 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected @@ -12,10 +12,10 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-NEXT: t24: i32 = ADD_R t5, t22, TargetConstant:i32<0> ; CHECK-NEXT: t3: i32,ch = LDW_RI TargetFrameIndex:i32<-1>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0 ; CHECK-NEXT: t19: i32,ch = LDW_RI TargetFrameIndex:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0 -; CHECK-NEXT: t25: i32 = ADD_R t3, t19, TargetConstant:i32<0> +; CHECK-NEXT: t27: i32 = ADD_R t3, t19, TargetConstant:i32<0> ; CHECK-NEXT: t30: i32,glue = SFSUB_F_RR t24, t5 ; CHECK-NEXT: t31: i32 = SCC TargetConstant:i32<4>, t30:1 -; CHECK-NEXT: t28: i32 = ADD_R t25, t31, TargetConstant:i32<0> +; CHECK-NEXT: t28: i32 = ADD_R t27, t31, TargetConstant:i32<0> ; CHECK-NEXT: t15: ch,glue = CopyToReg t0, Register:i32 $rv, t28 ; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $r9, t24, t15:1 ; CHECK-NEXT: t18: ch = RET Register:i32 $rv, Register:i32 $r9, t17, t17:1 diff --git a/llvm/test/tools/dxil-dis/metadata.ll b/llvm/test/tools/dxil-dis/metadata.ll index 758860a2deb8f..18f2530ab8fc2 100644 --- a/llvm/test/tools/dxil-dis/metadata.ll +++ b/llvm/test/tools/dxil-dis/metadata.ll @@ -1,13 +1,21 @@ -; RUN: llc --filetype=obj %s -o - | dxil-dis +; RUN: llc --filetype=obj %s -o - | dxil-dis target triple = "dxil-unknown-shadermodel6.7-library" +define void @kernel(ptr addrspace(1)) { + ret void +} + !llvm.foo = !{!0} !llvm.bar = !{!1} +!llvm.baz = !{!2} !0 = !{i32 42} !1 = !{!"Some MDString"} +!2 = !{ptr @kernel} ; CHECK: !llvm.foo = !{!0} ; CHECK: !llvm.bar = !{!1} +; CHECK: !llvm.baz = !{!2} ; CHECK: !0 = !{i32 42} ; CHECK: !1 = !{!"Some MDString"} +; CHECK: !2 = !{void (i8 addrspace(1)*)* @kernel} diff --git a/llvm/tools/dxil-dis/CMakeLists.txt b/llvm/tools/dxil-dis/CMakeLists.txt index 9addf108a8614..d0541fcf802e9 100644 --- a/llvm/tools/dxil-dis/CMakeLists.txt +++ b/llvm/tools/dxil-dis/CMakeLists.txt @@ -25,7 +25,9 @@ include(ExternalProject) set(SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/DXC-src) set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/DXC-bins) -set(GIT_SETTINGS GIT_REPOSITORY https://github.com/microsoft/DirectXShaderCompiler.git) +set(GIT_SETTINGS + GIT_REPOSITORY https://github.com/microsoft/DirectXShaderCompiler.git + GIT_TAG main) if (DXC_SOURCE_DIR) set(SOURCE_DIR ${DXC_SOURCE_DIR}) diff --git a/llvm/tools/llvm-debuginfod-find/CMakeLists.txt b/llvm/tools/llvm-debuginfod-find/CMakeLists.txt index b98c431c1839b..39da11fcd9599 100644 --- a/llvm/tools/llvm-debuginfod-find/CMakeLists.txt +++ b/llvm/tools/llvm-debuginfod-find/CMakeLists.txt @@ -1,11 +1,21 @@ set(LLVM_LINK_COMPONENTS + Option Object Support ) +set(LLVM_TARGET_DEFINITIONS Opts.td) +tablegen(LLVM Opts.inc -gen-opt-parser-defs) +add_public_tablegen_target(DebugInfodFindOptsTableGen) + add_llvm_tool(llvm-debuginfod-find 
  llvm-debuginfod-find.cpp
+  DEPENDS
+  DebugInfodFindOptsTableGen
+  GENERATE_DRIVER
   )
-target_link_libraries(llvm-debuginfod-find PRIVATE LLVMDebuginfod)
+if(NOT LLVM_TOOL_LLVM_DRIVER_BUILD)
+  target_link_libraries(llvm-debuginfod-find PRIVATE LLVMDebuginfod)
+endif()
 if(LLVM_INSTALL_BINUTILS_SYMLINKS)
   add_llvm_tool_symlink(debuginfod-find llvm-debuginfod-find)
 endif()
diff --git a/llvm/tools/llvm-debuginfod-find/Opts.td b/llvm/tools/llvm-debuginfod-find/Opts.td
new file mode 100644
index 0000000000000..a770f50d241a2
--- /dev/null
+++ b/llvm/tools/llvm-debuginfod-find/Opts.td
@@ -0,0 +1,17 @@
+include "llvm/Option/OptParser.td"
+
+class F<string name, string help> : Flag<["-"], name>, HelpText<help>;
+class FF<string name, string help>: Flag<["--"], name>, HelpText<help>;
+class S<string name, string meta, string help>: Separate<["--"], name>, HelpText<help>, MetaVarName<meta>;
+
+def help : FF<"help", "Display available options">;
+def : F<"h", "Alias for --help">, Alias<help>;
+
+def fetch_executable : FF<"executable", "If set, fetch a binary file associated with this build id, containing the executable sections.">;
+def fetch_debuginfo : FF<"debuginfo", "If set, fetch a binary file associated with this build id, containing the debuginfo sections.">;
+def fetch_source : S<"source", "", "Fetch a source file associated with this build id, which is at this relative path relative to the compilation directory.">;
+def dump_to_stdout : FF<"dump", "If set, dumps the contents of the fetched artifact "
+                                "to standard output. Otherwise, dumps the absolute "
+                                "path to the cached artifact on disk.">;
+def debug_file_directory : S<"debug-file-directory", "", "Path to directory where to look for debug files.">;
+
diff --git a/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp b/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp
index 425ee8d986a82..1f4404aaa391f 100644
--- a/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp
+++ b/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp
@@ -16,14 +16,89 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Debuginfod/BuildIDFetcher.h"
 #include "llvm/Debuginfod/Debuginfod.h"
 #include "llvm/Debuginfod/HTTPClient.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/Option.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/LLVMDriver.h"
 
 using namespace llvm;
 
+// Command-line option boilerplate.
+namespace {
+enum ID {
+  OPT_INVALID = 0, // This is not an option ID.
+#define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__),
+#include "Opts.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE)                                                    \
+  static constexpr StringLiteral NAME##_init[] = VALUE;                        \
+  static constexpr ArrayRef<StringLiteral> NAME(NAME##_init,                   \
+                                                std::size(NAME##_init) - 1);
+#include "Opts.inc"
+#undef PREFIX
+
+using namespace llvm::opt;
+static constexpr opt::OptTable::Info InfoTable[] = {
+#define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__),
+#include "Opts.inc"
+#undef OPTION
+};
+
+class DebuginfodFindOptTable : public opt::GenericOptTable {
+public:
+  DebuginfodFindOptTable() : GenericOptTable(InfoTable) {}
+};
+
+} // end anonymous namespace
+
+static std::string InputBuildID;
+static bool FetchExecutable;
+static bool FetchDebuginfo;
+static std::string FetchSource;
+static bool DumpToStdout;
+static std::vector<std::string> DebugFileDirectory;
+
+static void parseArgs(int argc, char **argv) {
+  DebuginfodFindOptTable Tbl;
+  llvm::StringRef ToolName = argv[0];
+  llvm::BumpPtrAllocator A;
+  llvm::StringSaver Saver{A};
+  opt::InputArgList Args =
+      Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](StringRef Msg) {
+        llvm::errs() << Msg << '\n';
+        std::exit(1);
+      });
+
+  if (Args.hasArg(OPT_help)) {
+    Tbl.printHelp(llvm::outs(),
+                  "llvm-debuginfod-find [options] <input build_id>",
+                  ToolName.str().c_str());
+    std::exit(0);
+  }
+
+  InputBuildID = Args.getLastArgValue(OPT_INPUT);
+
+  FetchExecutable = Args.hasArg(OPT_fetch_executable);
+  FetchDebuginfo = Args.hasArg(OPT_fetch_debuginfo);
+  DumpToStdout = Args.hasArg(OPT_dump_to_stdout);
+  FetchSource = Args.getLastArgValue(OPT_fetch_source, "");
+  DebugFileDirectory = Args.getAllArgValues(OPT_debug_file_directory);
+}
+
+[[noreturn]] static void helpExit() {
+  errs() << "Must specify exactly one of --executable, "
+            "--source=/path/to/file, or --debuginfo.\n";
+  exit(1);
+}
+
+/*
 cl::OptionCategory DebuginfodFindCategory("llvm-debuginfod-find Options");
 
 cl::opt<std::string> InputBuildID(cl::Positional, cl::Required,
@@ -60,30 +135,17 @@ static cl::list<std::string> DebugFileDirectory(
     cl::desc("Path to directory where to look for debug files."),
     cl::cat(DebuginfodFindCategory));
 
-[[noreturn]] static void helpExit() {
-  errs() << "Must specify exactly one of --executable, "
-            "--source=/path/to/file, or --debuginfo.";
-  exit(1);
-}
+*/
 
-ExitOnError ExitOnErr;
+ExitOnError ExitOnDebuginfodFindError;
 
 static std::string fetchDebugInfo(object::BuildIDRef BuildID);
 
-int main(int argc, char **argv) {
-  InitLLVM X(argc, argv);
+int llvm_debuginfod_find_main(int argc, char **argv,
+                              const llvm::ToolContext &) {
+  // InitLLVM X(argc, argv);
   HTTPClient::initialize();
-
-  cl::HideUnrelatedOptions({&DebuginfodFindCategory});
-  cl::ParseCommandLineOptions(
-      argc, argv,
-      "llvm-debuginfod-find: Fetch debuginfod artifacts\n\n"
-      "This program is a frontend to the debuginfod client library. The cache "
-      "directory, request timeout (in seconds), and debuginfod server urls are "
-      "set by these environment variables:\n"
-      "DEBUGINFOD_CACHE_PATH (default set by sys::path::cache_directory)\n"
-      "DEBUGINFOD_TIMEOUT (defaults to 90s)\n"
-      "DEBUGINFOD_URLS=[comma separated URLs] (defaults to empty)\n");
+  parseArgs(argc, argv);
 
   if (FetchExecutable + FetchDebuginfo + (FetchSource != "") != 1)
     helpExit();
@@ -97,9 +159,10 @@ int main(int argc, char **argv) {
 
   std::string Path;
   if (FetchSource != "")
-    Path = ExitOnErr(getCachedOrDownloadSource(ID, FetchSource));
+    Path =
+        ExitOnDebuginfodFindError(getCachedOrDownloadSource(ID, FetchSource));
   else if (FetchExecutable)
-    Path = ExitOnErr(getCachedOrDownloadExecutable(ID));
+    Path = ExitOnDebuginfodFindError(getCachedOrDownloadExecutable(ID));
   else if (FetchDebuginfo)
     Path = fetchDebugInfo(ID);
   else
@@ -110,11 +173,13 @@ int main(int argc, char **argv) {
     // Print the contents of the artifact.
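     // MemoryBuffer::getFile returns ErrorOr<std::unique_ptr<MemoryBuffer>>;
     // a failed open/read surfaces as an error code that is converted to an
     // Error and reported through the ExitOnError handler below.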
     ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(
         Path, /*IsText=*/false, /*RequiresNullTerminator=*/false);
-    ExitOnErr(errorCodeToError(Buf.getError()));
+    ExitOnDebuginfodFindError(errorCodeToError(Buf.getError()));
     outs() << Buf.get()->getBuffer();
   } else
     // Print the path to the cached artifact file.
     outs() << Path << "\n";
+
+  return 0;
 }
 
 // Find a debug file in local build ID directories and via debuginfod.
diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp
index 65992d8cb95ee..3517f0e32b1bb 100644
--- a/llvm/unittests/SandboxIR/PassTest.cpp
+++ b/llvm/unittests/SandboxIR/PassTest.cpp
@@ -9,6 +9,7 @@
 #include "llvm/SandboxIR/Pass.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/Module.h"
+#include "llvm/SandboxIR/PassManager.h"
 #include "llvm/SandboxIR/SandboxIR.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
@@ -82,3 +83,82 @@ define void @foo() {
   EXPECT_DEATH(TestNamePass("-dash"), ".*start with.*");
 #endif
 }
+
+TEST_F(PassTest, FunctionPassManager) {
+  auto *F = parseFunction(R"IR(
+define void @foo() {
+  ret void
+}
+)IR",
+                          "foo");
+  class TestPass1 final : public FunctionPass {
+    unsigned &BBCnt;
+
+  public:
+    TestPass1(unsigned &BBCnt) : FunctionPass("test-pass1"), BBCnt(BBCnt) {}
+    bool runOnFunction(Function &F) final {
+      for ([[maybe_unused]] auto &BB : F)
+        ++BBCnt;
+      return false;
+    }
+  };
+  class TestPass2 final : public FunctionPass {
+    unsigned &BBCnt;
+
+  public:
+    TestPass2(unsigned &BBCnt) : FunctionPass("test-pass2"), BBCnt(BBCnt) {}
+    bool runOnFunction(Function &F) final {
+      for ([[maybe_unused]] auto &BB : F)
+        ++BBCnt;
+      return false;
+    }
+  };
+  unsigned BBCnt1 = 0;
+  unsigned BBCnt2 = 0;
+  TestPass1 TPass1(BBCnt1);
+  TestPass2 TPass2(BBCnt2);
+
+  FunctionPassManager FPM("test-fpm");
+  FPM.addPass(&TPass1);
+  FPM.addPass(&TPass2);
+  // Check runOnFunction().
+  FPM.runOnFunction(*F);
+  EXPECT_EQ(BBCnt1, 1u);
+  EXPECT_EQ(BBCnt2, 1u);
+#ifndef NDEBUG
+  // Check dump().
+  std::string Buff;
+  llvm::raw_string_ostream SS(Buff);
+  FPM.print(SS);
+  EXPECT_EQ(Buff, "test-fpm(test-pass1,test-pass2)");
+#endif // NDEBUG
+}
+
+TEST_F(PassTest, PassRegistry) {
+  class TestPass1 final : public FunctionPass {
+  public:
+    TestPass1() : FunctionPass("test-pass1") {}
+    bool runOnFunction(Function &F) final { return false; }
+  };
+  class TestPass2 final : public FunctionPass {
+  public:
+    TestPass2() : FunctionPass("test-pass2") {}
+    bool runOnFunction(Function &F) final { return false; }
+  };
+
+  PassRegistry Registry;
+  auto &TP1 = Registry.registerPass(std::make_unique<TestPass1>());
+  auto &TP2 = Registry.registerPass(std::make_unique<TestPass2>());
+
+  // Check getPassByName().
+  EXPECT_EQ(Registry.getPassByName("test-pass1"), &TP1);
+  EXPECT_EQ(Registry.getPassByName("test-pass2"), &TP2);
+
+#ifndef NDEBUG
+  // Check print().
+  std::string Buff;
+  llvm::raw_string_ostream SS(Buff);
+  Registry.print(SS);
+  EXPECT_EQ(Buff, "test-pass1\ntest-pass2\n");
+#endif // NDEBUG
+}
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index 1b939b4d047aa..b76d24dc297b9 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -729,6 +729,54 @@ define void @foo() {
   EXPECT_EQ(UndefStruct->getNumElements(), 2u);
 }
 
+TEST_F(SandboxIRTest, BlockAddress) {
+  parseIR(C, R"IR(
+define void @foo(ptr %ptr) {
+bb0:
+  store ptr blockaddress(@foo, %bb0), ptr %ptr
+  ret void
+bb1:
+  ret void
+bb2:
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+
+  auto &F = *Ctx.createFunction(&LLVMF);
+  auto *BB0 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+  auto *BB1 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb1")));
+  auto *BB2 = cast<sandboxir::BasicBlock>(
+      Ctx.getValue(getBasicBlockByName(LLVMF, "bb2")));
+  auto It = BB0->begin();
+  auto *SI = cast<sandboxir::StoreInst>(&*It++);
+  [[maybe_unused]] auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+  // Check classof(), creation, getFunction(), getBasicBlock().
+  auto *BB0Addr = cast<sandboxir::BlockAddress>(SI->getValueOperand());
+  EXPECT_EQ(BB0Addr->getBasicBlock(), BB0);
+  EXPECT_EQ(BB0Addr->getFunction(), &F);
+  // Check get(F, BB).
+  auto *NewBB0Addr = sandboxir::BlockAddress::get(&F, BB0);
+  EXPECT_EQ(NewBB0Addr, BB0Addr);
+  // Check get(BB).
+  auto *NewBB0Addr2 = sandboxir::BlockAddress::get(BB0);
+  EXPECT_EQ(NewBB0Addr2, BB0Addr);
+  auto *BB1Addr = sandboxir::BlockAddress::get(BB1);
+  EXPECT_EQ(BB1Addr->getBasicBlock(), BB1);
+  EXPECT_NE(BB1Addr, BB0Addr);
+  // Check lookup().
+  auto *LookupBB0Addr = sandboxir::BlockAddress::lookup(BB0);
+  EXPECT_EQ(LookupBB0Addr, BB0Addr);
+  auto *LookupBB1Addr = sandboxir::BlockAddress::lookup(BB1);
+  EXPECT_EQ(LookupBB1Addr, BB1Addr);
+  auto *LookupBB2Addr = sandboxir::BlockAddress::lookup(BB2);
+  EXPECT_EQ(LookupBB2Addr, nullptr);
+}
+
 TEST_F(SandboxIRTest, Use) {
   parseIR(C, R"IR(
 define i32 @foo(i32 %v0, i32 %v1) {
diff --git a/llvm/unittests/SandboxIR/TypesTest.cpp b/llvm/unittests/SandboxIR/TypesTest.cpp
index e4f9235c1ef3c..40aa32fb08ed0 100644
--- a/llvm/unittests/SandboxIR/TypesTest.cpp
+++ b/llvm/unittests/SandboxIR/TypesTest.cpp
@@ -323,6 +323,123 @@ define void @foo(<4 x i16> %vi0, <4 x float> %vf1, i8 %i0) {
   EXPECT_FALSE(sandboxir::VectorType::isValidElementType(FVecTy));
 }
 
+TEST_F(SandboxTypeTest, FixedVectorType) {
+  parseIR(C, R"IR(
+define void @foo(<4 x i16> %vi0, <4 x float> %vf1, i8 %i0) {
+  ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  auto *F = Ctx.createFunction(LLVMF);
+  // Check classof(), creation, accessors
+  auto *Vec4i16Ty = cast<sandboxir::FixedVectorType>(F->getArg(0)->getType());
+  EXPECT_TRUE(Vec4i16Ty->getElementType()->isIntegerTy(16));
+  EXPECT_EQ(Vec4i16Ty->getElementCount(), ElementCount::getFixed(4));
+
+  // get(ElementType, NumElements)
+  EXPECT_EQ(
+      sandboxir::FixedVectorType::get(sandboxir::Type::getInt16Ty(Ctx), 4),
+      F->getArg(0)->getType());
+  // get(ElementType, Other)
+  EXPECT_EQ(sandboxir::FixedVectorType::get(
+                sandboxir::Type::getInt16Ty(Ctx),
+                cast<sandboxir::FixedVectorType>(F->getArg(0)->getType())),
+            F->getArg(0)->getType());
+  auto *Vec4FTy = cast<sandboxir::FixedVectorType>(F->getArg(1)->getType());
+  EXPECT_TRUE(Vec4FTy->getElementType()->isFloatTy());
+  // getInteger
+  auto *Vec4i32Ty = sandboxir::FixedVectorType::getInteger(Vec4FTy);
+  EXPECT_TRUE(Vec4i32Ty->getElementType()->isIntegerTy(32));
+  EXPECT_EQ(Vec4i32Ty->getElementCount(), Vec4FTy->getElementCount());
+  // getExtendedElementCountVectorType
+  auto *Vec4i64Ty =
+      sandboxir::FixedVectorType::getExtendedElementVectorType(Vec4i16Ty);
+  EXPECT_TRUE(Vec4i64Ty->getElementType()->isIntegerTy(32));
+  EXPECT_EQ(Vec4i64Ty->getElementCount(), Vec4i16Ty->getElementCount());
+  // getTruncatedElementVectorType
+  auto *Vec4i8Ty =
+      sandboxir::FixedVectorType::getTruncatedElementVectorType(Vec4i16Ty);
+  EXPECT_TRUE(Vec4i8Ty->getElementType()->isIntegerTy(8));
+  EXPECT_EQ(Vec4i8Ty->getElementCount(), Vec4i8Ty->getElementCount());
+  // getSubdividedVectorType
+  auto *Vec8i8Ty =
+      sandboxir::FixedVectorType::getSubdividedVectorType(Vec4i16Ty, 1);
+  EXPECT_TRUE(Vec8i8Ty->getElementType()->isIntegerTy(8));
+  EXPECT_EQ(Vec8i8Ty->getElementCount(), ElementCount::getFixed(8));
+  // getNumElements
+  EXPECT_EQ(Vec8i8Ty->getNumElements(), 8u);
+  // getHalfElementsVectorType
+  auto *Vec2i16Ty =
+      sandboxir::FixedVectorType::getHalfElementsVectorType(Vec4i16Ty);
+  EXPECT_TRUE(Vec2i16Ty->getElementType()->isIntegerTy(16));
+  EXPECT_EQ(Vec2i16Ty->getElementCount(), ElementCount::getFixed(2));
+  // getDoubleElementsVectorType
+  auto *Vec8i16Ty =
+      sandboxir::FixedVectorType::getDoubleElementsVectorType(Vec4i16Ty);
+  EXPECT_TRUE(Vec8i16Ty->getElementType()->isIntegerTy(16));
+  EXPECT_EQ(Vec8i16Ty->getElementCount(), ElementCount::getFixed(8));
+}
+
+TEST_F(SandboxTypeTest, ScalableVectorType) {
+  parseIR(C, R"IR(
+define void @foo(<vscale x 4 x i16> %vi0, <vscale x 4 x float> %vf1, i8 %i0) {
+  ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  auto *F = Ctx.createFunction(LLVMF);
+  // Check classof(), creation, accessors
+  auto *Vec4i16Ty =
+      cast<sandboxir::ScalableVectorType>(F->getArg(0)->getType());
+  EXPECT_TRUE(Vec4i16Ty->getElementType()->isIntegerTy(16));
+  EXPECT_EQ(Vec4i16Ty->getMinNumElements(), 4u);
+
+  // get(ElementType, NumElements)
+  EXPECT_EQ(
+      sandboxir::ScalableVectorType::get(sandboxir::Type::getInt16Ty(Ctx), 4),
+      F->getArg(0)->getType());
+  // get(ElementType, Other)
+  EXPECT_EQ(sandboxir::ScalableVectorType::get(
+                sandboxir::Type::getInt16Ty(Ctx),
+                cast<sandboxir::ScalableVectorType>(F->getArg(0)->getType())),
+            F->getArg(0)->getType());
+  auto *Vec4FTy = cast<sandboxir::ScalableVectorType>(F->getArg(1)->getType());
+  EXPECT_TRUE(Vec4FTy->getElementType()->isFloatTy());
+  // getInteger
+  auto *Vec4i32Ty = sandboxir::ScalableVectorType::getInteger(Vec4FTy);
+  EXPECT_TRUE(Vec4i32Ty->getElementType()->isIntegerTy(32));
+  EXPECT_EQ(Vec4i32Ty->getMinNumElements(), Vec4FTy->getMinNumElements());
+  // getExtendedElementCountVectorType
+  auto *Vec4i64Ty =
+      sandboxir::ScalableVectorType::getExtendedElementVectorType(Vec4i16Ty);
+  EXPECT_TRUE(Vec4i64Ty->getElementType()->isIntegerTy(32));
+  EXPECT_EQ(Vec4i64Ty->getMinNumElements(), Vec4i16Ty->getMinNumElements());
+  // getTruncatedElementVectorType
+  auto *Vec4i8Ty =
+      sandboxir::ScalableVectorType::getTruncatedElementVectorType(Vec4i16Ty);
+  EXPECT_TRUE(Vec4i8Ty->getElementType()->isIntegerTy(8));
+  EXPECT_EQ(Vec4i8Ty->getMinNumElements(), Vec4i8Ty->getMinNumElements());
+  // getSubdividedVectorType
+  auto *Vec8i8Ty =
+      sandboxir::ScalableVectorType::getSubdividedVectorType(Vec4i16Ty, 1);
+  EXPECT_TRUE(Vec8i8Ty->getElementType()->isIntegerTy(8));
+  EXPECT_EQ(Vec8i8Ty->getMinNumElements(), 8u);
+  // getMinNumElements
+  EXPECT_EQ(Vec8i8Ty->getMinNumElements(), 8u);
+  // getHalfElementsVectorType
+  auto *Vec2i16Ty =
+      sandboxir::ScalableVectorType::getHalfElementsVectorType(Vec4i16Ty);
+  EXPECT_TRUE(Vec2i16Ty->getElementType()->isIntegerTy(16));
+  EXPECT_EQ(Vec2i16Ty->getMinNumElements(), 2u);
+  // getDoubleElementsVectorType
+  auto *Vec8i16Ty =
+      sandboxir::ScalableVectorType::getDoubleElementsVectorType(Vec4i16Ty);
+  EXPECT_TRUE(Vec8i16Ty->getElementType()->isIntegerTy(16));
+  EXPECT_EQ(Vec8i16Ty->getMinNumElements(), 8u);
+}
+
 TEST_F(SandboxTypeTest, FunctionType) {
   parseIR(C, R"IR(
 define void @foo() {
diff --git a/llvm/utils/TableGen/Common/OptEmitter.cpp b/llvm/utils/TableGen/Common/OptEmitter.cpp
index 7fcf3074e0931..75e32c36d4f72 100644
--- a/llvm/utils/TableGen/Common/OptEmitter.cpp
+++ b/llvm/utils/TableGen/Common/OptEmitter.cpp
@@ -1,4 +1,4 @@
-//===- OptEmitter.cpp - Helper for emitting options.----------- -----------===//
+//===- OptEmitter.cpp - Helper for emitting options -------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -39,21 +39,21 @@ static int StrCmpOptionName(const char *A, const char *B) {
   return (a < b) ? -1 : 1;
 }
 
-int CompareOptionRecords(Record *const *Av, Record *const *Bv) {
-  const Record *A = *Av;
-  const Record *B = *Bv;
-
+// Returns true if A is ordered before B.
+bool CompareOptionRecords(const Record *A, const Record *B) {
+  if (A == B)
+    return false;
   // Sentinel options precede all others and are only ordered by precedence.
   bool ASent = A->getValueAsDef("Kind")->getValueAsBit("Sentinel");
   bool BSent = B->getValueAsDef("Kind")->getValueAsBit("Sentinel");
   if (ASent != BSent)
-    return ASent ? -1 : 1;
+    return ASent;
 
   // Compare options by name, unless they are sentinels.
   if (!ASent)
     if (int Cmp = StrCmpOptionName(A->getValueAsString("Name").str().c_str(),
                                    B->getValueAsString("Name").str().c_str()))
-      return Cmp;
+      return Cmp < 0;
 
   if (!ASent) {
     std::vector<StringRef> APrefixes = A->getValueAsListOfStrings("Prefixes");
@@ -65,7 +65,7 @@ int CompareOptionRecords(Record *const *Av, Record *const *Bv) {
            BEPre = BPrefixes.end();
          APre != AEPre && BPre != BEPre; ++APre, ++BPre) {
       if (int Cmp = StrCmpOptionName(APre->str().c_str(), BPre->str().c_str()))
-        return Cmp;
+        return Cmp < 0;
     }
   }
 
@@ -78,7 +78,7 @@
     PrintError(B->getLoc(), Twine("Other defined here"));
     PrintFatalError("Equivalent Options found.");
   }
-  return APrec < BPrec ? -1 : 1;
+  return APrec < BPrec;
 }
 
 } // namespace llvm
diff --git a/llvm/utils/TableGen/Common/OptEmitter.h b/llvm/utils/TableGen/Common/OptEmitter.h
index eaef966bbac66..5eecd61987337 100644
--- a/llvm/utils/TableGen/Common/OptEmitter.h
+++ b/llvm/utils/TableGen/Common/OptEmitter.h
@@ -1,4 +1,4 @@
-//===- OptEmitter.h - Helper for emitting options. --------------*- C++ -*-===//
+//===- OptEmitter.h - Helper for emitting options ---------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -11,7 +11,7 @@
 namespace llvm {
 class Record;
-int CompareOptionRecords(Record *const *Av, Record *const *Bv);
+bool CompareOptionRecords(const Record *A, const Record *B);
 } // namespace llvm
 #endif // LLVM_UTILS_TABLEGEN_COMMON_OPTEMITTER_H
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
index 1b93e3d5e3b70..a14cc3d6b844c 100644
--- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -71,9 +71,9 @@ class MatcherTableEmitter {
   MapVector<std::string, unsigned, StringMap<unsigned>> VecPatterns;
 
   unsigned getPatternIdxFromTable(std::string &&P, std::string &&include_loc) {
-    const auto It = VecPatterns.find(P);
-    if (It == VecPatterns.end()) {
-      VecPatterns.insert(std::pair(std::move(P), VecPatterns.size()));
+    const auto [It, Inserted] =
+        VecPatterns.try_emplace(std::move(P), VecPatterns.size());
+    if (Inserted) {
       VecIncludeStrings.push_back(std::move(include_loc));
       return VecIncludeStrings.size() - 1;
     }
diff --git a/llvm/utils/TableGen/ExegesisEmitter.cpp b/llvm/utils/TableGen/ExegesisEmitter.cpp
index 0de7cb4233748..a5dd2994b3753 100644
--- a/llvm/utils/TableGen/ExegesisEmitter.cpp
+++ b/llvm/utils/TableGen/ExegesisEmitter.cpp
@@ -30,7 +30,7 @@ namespace {
 
 class ExegesisEmitter {
 public:
-  ExegesisEmitter(RecordKeeper &RK);
+  ExegesisEmitter(const RecordKeeper &RK);
 
   void run(raw_ostream &OS) const;
 
@@ -51,7 +51,7 @@ class ExegesisEmitter {
 
   void emitPfmCountersLookupTable(raw_ostream &OS) const;
 
-  RecordKeeper &Records;
+  const RecordKeeper &Records;
   std::string Target;
 
   // Table of counter name -> counter index.
@@ -59,7 +59,7 @@
 };
 
 static std::map<llvm::StringRef, unsigned>
-collectPfmCounters(RecordKeeper &Records) {
+collectPfmCounters(const RecordKeeper &Records) {
   std::map<llvm::StringRef, unsigned> PfmCounterNameTable;
   const auto AddPfmCounterName = [&PfmCounterNameTable](
                                      const Record *PfmCounterDef) {
@@ -67,7 +67,8 @@ collectPfmCounters(RecordKeeper &Records) {
     if (!Counter.empty())
       PfmCounterNameTable.emplace(Counter, 0);
   };
-  for (Record *Def : Records.getAllDerivedDefinitions("ProcPfmCounters")) {
+  for (const Record *Def :
+       Records.getAllDerivedDefinitions("ProcPfmCounters")) {
     // Check that ResourceNames are unique.
     llvm::SmallSet<llvm::StringRef, 16> Seen;
     for (const Record *IssueCounter :
@@ -95,9 +96,9 @@ collectPfmCounters(RecordKeeper &Records)
   return PfmCounterNameTable;
 }
 
-ExegesisEmitter::ExegesisEmitter(RecordKeeper &RK)
+ExegesisEmitter::ExegesisEmitter(const RecordKeeper &RK)
     : Records(RK), PfmCounterNameTable(collectPfmCounters(RK)) {
-  std::vector<Record *> Targets = Records.getAllDerivedDefinitions("Target");
+  ArrayRef<const Record *> Targets = Records.getAllDerivedDefinitions("Target");
   if (Targets.size() == 0)
     PrintFatalError("No 'Target' subclasses defined!");
   if (Targets.size() != 1)
@@ -223,7 +224,7 @@ void ExegesisEmitter::emitPfmCounters(raw_ostream &OS) const {
 } // namespace
 
 void ExegesisEmitter::emitPfmCountersLookupTable(raw_ostream &OS) const {
-  std::vector<Record *> Bindings =
+  std::vector<const Record *> Bindings =
       Records.getAllDerivedDefinitions("PfmCountersBinding");
   assert(!Bindings.empty() && "there must be at least one binding");
   llvm::sort(Bindings, [](const Record *L, const Record *R) {
@@ -232,7 +233,7 @@ void ExegesisEmitter::emitPfmCountersLookupTable(raw_ostream &OS) const {
   OS << "// Sorted (by CpuName) array of pfm counters.\n"
      << "static const CpuAndPfmCounters " << Target << "CpuPfmCounters[] = {\n";
-  for (Record *Binding : Bindings) {
+  for (const Record *Binding : Bindings) {
     // Emit as  { "cpu", procinit },
     OS << "  { \""                                        //
        << Binding->getValueAsString("CpuName") << "\","   //
diff --git a/llvm/utils/TableGen/OptParserEmitter.cpp b/llvm/utils/TableGen/OptParserEmitter.cpp
index 81195c8c106c2..a41c684f169e9 100644
--- a/llvm/utils/TableGen/OptParserEmitter.cpp
+++ b/llvm/utils/TableGen/OptParserEmitter.cpp
@@ -250,15 +250,15 @@ static void EmitHelpTextsForVariants(
 /// OptParserEmitter - This tablegen backend takes an input .td file
 /// describing a list of options and emits a data structure for parsing and
 /// working with those options when given an input command line.
-static void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) {
+static void EmitOptParser(const RecordKeeper &Records, raw_ostream &OS) {
   // Get the option groups and options.
-  const std::vector<Record *> &Groups =
+  ArrayRef<const Record *> Groups =
       Records.getAllDerivedDefinitions("OptionGroup");
-  std::vector<Record *> Opts = Records.getAllDerivedDefinitions("Option");
+  std::vector<const Record *> Opts = Records.getAllDerivedDefinitions("Option");
 
   emitSourceFileHeader("Option Parsing Definitions", OS);
 
-  array_pod_sort(Opts.begin(), Opts.end(), CompareOptionRecords);
+  llvm::sort(Opts, CompareOptionRecords);
   // Generate prefix groups.
   typedef SmallVector<std::pair<std::string, std::string>, 2> PrefixKeyT;
   typedef std::map<PrefixKeyT, std::string> PrefixesT;
diff --git a/llvm/utils/TableGen/OptRSTEmitter.cpp b/llvm/utils/TableGen/OptRSTEmitter.cpp
index 75b7cbdf29887..43b0f78c44d90 100644
--- a/llvm/utils/TableGen/OptRSTEmitter.cpp
+++ b/llvm/utils/TableGen/OptRSTEmitter.cpp
@@ -16,30 +16,24 @@ using namespace llvm;
 
 /// OptParserEmitter - This tablegen backend takes an input .td file
 /// describing a list of options and emits a RST man page.
-static void EmitOptRST(RecordKeeper &Records, raw_ostream &OS) {
-  llvm::StringMap<std::vector<Record *>> OptionsByGroup;
+static void EmitOptRST(const RecordKeeper &Records, raw_ostream &OS) {
+  llvm::StringMap<std::vector<const Record *>> OptionsByGroup;
   std::vector<const Record *> OptionsWithoutGroup;
 
   // Get the options.
-  std::vector<Record *> Opts = Records.getAllDerivedDefinitions("Option");
-  array_pod_sort(Opts.begin(), Opts.end(), CompareOptionRecords);
+  std::vector<const Record *> Opts = Records.getAllDerivedDefinitions("Option");
+  llvm::sort(Opts, CompareOptionRecords);
 
   // Get the option groups.
-  const std::vector<Record *> &Groups =
-      Records.getAllDerivedDefinitions("OptionGroup");
-  for (unsigned i = 0, e = Groups.size(); i != e; ++i) {
-    const Record &R = *Groups[i];
-    OptionsByGroup.try_emplace(R.getValueAsString("Name"));
-  }
+  for (const Record *R : Records.getAllDerivedDefinitions("OptionGroup"))
+    OptionsByGroup.try_emplace(R->getValueAsString("Name"));
 
   // Map options to their group.
-  for (unsigned i = 0, e = Opts.size(); i != e; ++i) {
-    const Record &R = *Opts[i];
-    if (const DefInit *DI = dyn_cast<DefInit>(R.getValueInit("Group"))) {
-      OptionsByGroup[DI->getDef()->getValueAsString("Name")].push_back(Opts[i]);
-    } else {
-      OptionsByGroup["options"].push_back(Opts[i]);
-    }
+  for (const Record *R : Opts) {
+    if (const DefInit *DI = dyn_cast<DefInit>(R->getValueInit("Group")))
+      OptionsByGroup[DI->getDef()->getValueAsString("Name")].push_back(R);
+    else
+      OptionsByGroup["options"].push_back(R);
   }
 
   // Print options under their group.
@@ -49,7 +43,7 @@ static void EmitOptRST(RecordKeeper &Records, raw_ostream &OS) {
     OS << std::string(GroupName.size(), '-') << '\n';
     OS << '\n';
 
-    for (Record *R : KV.getValue()) {
+    for (const Record *R : KV.getValue()) {
       OS << ".. option:: ";
 
       // Print the prefix.
diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
index 28f250ad3b7ba..ff4f558ca2fcf 100644
--- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
@@ -110,6 +110,7 @@ static_library("CodeGen") {
     "Targets/AVR.cpp",
     "Targets/BPF.cpp",
     "Targets/CSKY.cpp",
+    "Targets/DirectX.cpp",
     "Targets/Hexagon.cpp",
     "Targets/Lanai.cpp",
     "Targets/LoongArch.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn
index aa49c76e84cec..e69104909330d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn
@@ -6,6 +6,7 @@ static_library("SandboxIR") {
   ]
   sources = [
     "Pass.cpp",
+    "PassManager.cpp",
    "SandboxIR.cpp",
     "Tracker.cpp",
     "Type.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn
index 009aba221a0bc..e296a7b93c760 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn
@@ -18,6 +18,7 @@ static_library("Coroutines") {
     "CoroFrame.cpp",
     "CoroSplit.cpp",
     "Coroutines.cpp",
+    "SpillUtils.cpp",
     "SuspendCrossingInfo.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn
index 16b2c53438314..6b926bc777dca 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn
@@ -1,6 +1,13 @@
 import("//llvm/tools/binutils_symlinks.gni")
+import("//llvm/utils/TableGen/tablegen.gni")
+import("//llvm/utils/gn/build/driver_executable.gni")
 import("//llvm/utils/gn/build/symlink_or_copy.gni")
 
+tablegen("Opts") {
+  visibility = [ ":llvm-debuginfod-find" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
 if (llvm_install_binutils_symlinks) {
   symlink_or_copy("debuginfod-find") {
     deps = [ ":llvm-debuginfod-find" ]
@@ -18,9 +25,11 @@ group("symlinks") {
   }
 }
 
-executable("llvm-debuginfod-find") {
+driver_executable("llvm-debuginfod-find") {
   deps = [
+    ":Opts",
     "//llvm/lib/Debuginfod",
+    "//llvm/lib/Option",
     "//llvm/lib/Support",
   ]
   sources = [ "llvm-debuginfod-find.cpp" ]
"llvm-debuginfod-find.cpp" ] diff --git a/mlir/include/mlir-c/BuiltinAttributes.h b/mlir/include/mlir-c/BuiltinAttributes.h index 231eb83b5e269..7c8c84e55b962 100644 --- a/mlir/include/mlir-c/BuiltinAttributes.h +++ b/mlir/include/mlir-c/BuiltinAttributes.h @@ -16,6 +16,7 @@ #include "mlir-c/AffineMap.h" #include "mlir-c/IR.h" +#include "mlir-c/IntegerSet.h" #include "mlir-c/Support.h" #ifdef __cplusplus @@ -177,6 +178,14 @@ MLIR_CAPI_EXPORTED bool mlirBoolAttrGetValue(MlirAttribute attr); /// Checks whether the given attribute is an integer set attribute. MLIR_CAPI_EXPORTED bool mlirAttributeIsAIntegerSet(MlirAttribute attr); +/// Creates an integer set attribute wrapping the given set. The attribute +/// belongs to the same context as the integer set. +MLIR_CAPI_EXPORTED MlirAttribute mlirIntegerSetAttrGet(MlirIntegerSet set); + +/// Returns the integer set wrapped in the given integer set attribute. +MLIR_CAPI_EXPORTED MlirIntegerSet +mlirIntegerSetAttrGetValue(MlirAttribute attr); + /// Returns the typeID of an IntegerSet attribute. MLIR_CAPI_EXPORTED MlirTypeID mlirIntegerSetAttrGetTypeID(void); diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index ac61117c3d6e3..31f2913924726 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -29,6 +29,7 @@ class LinalgStructuredBase_Op props> : Op, DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, DestinationStyleOpInterface, LinalgStructuredInterface, ReifyRankedShapedTypeOpInterface], props)> { diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt index dd349d1392e7b..a65c6b1d3c96b 100644 --- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt @@ -17,6 +17,7 @@ mlir_tablegen(OpenMPOpsDialect.h.inc -gen-dialect-decls -dialect=omp) mlir_tablegen(OpenMPOpsDialect.cpp.inc -gen-dialect-defs -dialect=omp) mlir_tablegen(OpenMPOps.h.inc -gen-op-decls) mlir_tablegen(OpenMPOps.cpp.inc -gen-op-defs) +mlir_tablegen(OpenMPClauseOps.h.inc -gen-openmp-clause-ops) mlir_tablegen(OpenMPOpsTypes.h.inc -gen-typedef-decls -typedefs-dialect=omp) mlir_tablegen(OpenMPOpsTypes.cpp.inc -gen-typedef-defs -typedefs-dialect=omp) mlir_tablegen(OpenMPOpsEnums.h.inc -gen-enum-decls) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h index 38e4d8f245e4f..1247a871f93c6 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h @@ -23,303 +23,31 @@ #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.h.inc" +#include "mlir/Dialect/OpenMP/OpenMPClauseOps.h.inc" + namespace mlir { namespace omp { //===----------------------------------------------------------------------===// -// Mixin structures defining MLIR operands associated with each OpenMP clause. +// Extra clause operand structures. 
 //===----------------------------------------------------------------------===//
 
-struct AlignedClauseOps {
-  llvm::SmallVector<Value> alignedVars;
-  llvm::SmallVector<Attribute> alignments;
-};
-
-struct AllocateClauseOps {
-  llvm::SmallVector<Value> allocateVars, allocatorVars;
-};
-
-struct CancelDirectiveNameClauseOps {
-  ClauseCancellationConstructTypeAttr cancelDirective;
-};
-
-struct CopyprivateClauseOps {
-  llvm::SmallVector<Value> copyprivateVars;
-  llvm::SmallVector<Attribute> copyprivateSyms;
-};
-
-struct CriticalNameClauseOps {
-  /// This field has a generic name because it's mirroring the `sym_name`
-  /// argument of the `OpenMP_CriticalNameClause` tablegen definition. That one
-  /// can't be renamed to anything more specific because the `sym_name` name is
-  /// a requirement of the `Symbol` MLIR trait associated with that clause.
-  StringAttr symName;
-};
-
-struct DependClauseOps {
-  llvm::SmallVector<Attribute> dependKinds;
-  llvm::SmallVector<Value> dependVars;
-};
-
-struct DeviceClauseOps {
-  Value device;
-};
-
 struct DeviceTypeClauseOps {
-  // The default capture type.
+  /// The default capture type.
   DeclareTargetDeviceType deviceType = DeclareTargetDeviceType::any;
 };
 
-struct DistScheduleClauseOps {
-  UnitAttr distScheduleStatic;
-  Value distScheduleChunkSize;
-};
-
-struct DoacrossClauseOps {
-  ClauseDependAttr doacrossDependType;
-  IntegerAttr doacrossNumLoops;
-  llvm::SmallVector<Value> doacrossDependVars;
-};
-
-struct FilterClauseOps {
-  Value filteredThreadId;
-};
-
-struct FinalClauseOps {
-  Value final;
-};
-
-struct GrainsizeClauseOps {
-  Value grainsize;
-};
-
-struct HasDeviceAddrClauseOps {
-  llvm::SmallVector<Value> hasDeviceAddrVars;
-};
-
-struct HintClauseOps {
-  IntegerAttr hint;
-};
-
-struct IfClauseOps {
-  Value ifVar;
-};
-
-struct InReductionClauseOps {
-  llvm::SmallVector<Value> inReductionVars;
-  llvm::SmallVector<bool> inReductionByref;
-  llvm::SmallVector<Attribute> inReductionSyms;
-};
-
-struct IsDevicePtrClauseOps {
-  llvm::SmallVector<Value> isDevicePtrVars;
-};
-
-struct LinearClauseOps {
-  llvm::SmallVector<Value> linearVars, linearStepVars;
-};
-
-struct LoopRelatedOps {
-  llvm::SmallVector<Value> loopLowerBounds, loopUpperBounds, loopSteps;
-  UnitAttr loopInclusive;
-};
-
-struct MapClauseOps {
-  llvm::SmallVector<Value> mapVars;
-};
-
-struct MergeableClauseOps {
-  UnitAttr mergeable;
-};
-
-struct NogroupClauseOps {
-  UnitAttr nogroup;
-};
-
-struct NontemporalClauseOps {
-  llvm::SmallVector<Value> nontemporalVars;
-};
-
-struct NowaitClauseOps {
-  UnitAttr nowait;
-};
-
-struct NumTasksClauseOps {
-  Value numTasks;
-};
-
-struct NumTeamsClauseOps {
-  Value numTeamsLower, numTeamsUpper;
-};
-
-struct NumThreadsClauseOps {
-  Value numThreads;
-};
-
-struct OrderClauseOps {
-  ClauseOrderKindAttr order;
-  OrderModifierAttr orderMod;
-};
-
-struct OrderedClauseOps {
-  IntegerAttr ordered;
-};
-
-struct ParallelizationLevelClauseOps {
-  UnitAttr parLevelSimd;
-};
-
-struct PriorityClauseOps {
-  Value priority;
-};
-
-struct PrivateClauseOps {
-  // SSA values that correspond to "original" values being privatized.
-  // They refer to the SSA value outside the OpenMP region from which a clone is
-  // created inside the region.
-  llvm::SmallVector<Value> privateVars;
-  // The list of symbols referring to delayed privatizer ops (i.e. `omp.private`
-  // ops).
-  llvm::SmallVector<Attribute> privateSyms;
-};
-
-struct ProcBindClauseOps {
-  ClauseProcBindKindAttr procBindKind;
-};
-
-struct ReductionClauseOps {
-  llvm::SmallVector<Value> reductionVars;
-  llvm::SmallVector<bool> reductionByref;
-  llvm::SmallVector<Attribute> reductionSyms;
-};
-
-struct SafelenClauseOps {
-  IntegerAttr safelen;
-};
-
-struct ScheduleClauseOps {
-  ClauseScheduleKindAttr scheduleKind;
-  Value scheduleChunk;
-  ScheduleModifierAttr scheduleMod;
-  UnitAttr scheduleSimd;
-};
-
-struct SimdlenClauseOps {
-  IntegerAttr simdlen;
-};
-
-struct TaskReductionClauseOps {
-  llvm::SmallVector<Value> taskReductionVars;
-  llvm::SmallVector<bool> taskReductionByref;
-  llvm::SmallVector<Attribute> taskReductionSyms;
-};
-
-struct ThreadLimitClauseOps {
-  Value threadLimit;
-};
-
-struct UntiedClauseOps {
-  UnitAttr untied;
-};
-
-struct UseDeviceAddrClauseOps {
-  llvm::SmallVector<Value> useDeviceAddrVars;
-};
-
-struct UseDevicePtrClauseOps {
-  llvm::SmallVector<Value> useDevicePtrVars;
-};
-
 //===----------------------------------------------------------------------===//
-// Structures defining clause operands associated with each OpenMP leaf
-// construct.
-//
-// These mirror the arguments expected by the corresponding OpenMP MLIR ops.
+// Extra operation operand structures.
 //===----------------------------------------------------------------------===//
 
-namespace detail {
-template <typename... Mixins>
-struct Clauses : public Mixins... {};
-} // namespace detail
-
-using CancelOperands =
-    detail::Clauses<CancelDirectiveNameClauseOps, IfClauseOps>;
-
-using CancellationPointOperands = detail::Clauses<CancelDirectiveNameClauseOps>;
-
-using CriticalDeclareOperands =
-    detail::Clauses<CriticalNameClauseOps, HintClauseOps>;
-
-// TODO `indirect` clause.
+// TODO: Add `indirect` clause.
 using DeclareTargetOperands = detail::Clauses<DeviceTypeClauseOps>;
 
-using DistributeOperands =
-    detail::Clauses<AllocateClauseOps, DistScheduleClauseOps, OrderClauseOps,
-                    PrivateClauseOps>;
-
-using LoopNestOperands = detail::Clauses<LoopRelatedOps>;
-
-using MaskedOperands = detail::Clauses<FilterClauseOps>;
-
-using OrderedOperands = detail::Clauses<DoacrossClauseOps>;
-
-using OrderedRegionOperands = detail::Clauses<ParallelizationLevelClauseOps>;
-
-using ParallelOperands =
-    detail::Clauses<AllocateClauseOps, IfClauseOps, NumThreadsClauseOps,
-                    PrivateClauseOps, ProcBindClauseOps, ReductionClauseOps>;
-
-using SectionsOperands = detail::Clauses<AllocateClauseOps, NowaitClauseOps,
-                                         PrivateClauseOps, ReductionClauseOps>;
-
-using SimdOperands =
-    detail::Clauses<AlignedClauseOps, IfClauseOps, LinearClauseOps,
-                    NontemporalClauseOps, OrderClauseOps, PrivateClauseOps,
-                    ReductionClauseOps, SafelenClauseOps, SimdlenClauseOps>;
-
-using SingleOperands = detail::Clauses<AllocateClauseOps, CopyprivateClauseOps,
-                                       NowaitClauseOps, PrivateClauseOps>;
-
-// TODO `defaultmap`, `uses_allocators` clauses.
-using TargetOperands =
-    detail::Clauses<AllocateClauseOps, DependClauseOps, DeviceClauseOps,
-                    HasDeviceAddrClauseOps, IfClauseOps, InReductionClauseOps,
-                    IsDevicePtrClauseOps, MapClauseOps, NowaitClauseOps,
-                    PrivateClauseOps, ThreadLimitClauseOps>;
-
-using TargetDataOperands =
-    detail::Clauses<DeviceClauseOps, IfClauseOps, MapClauseOps,
-                    UseDeviceAddrClauseOps, UseDevicePtrClauseOps>;
-
-using TargetEnterExitUpdateDataOperands =
-    detail::Clauses<DependClauseOps, DeviceClauseOps, IfClauseOps, MapClauseOps,
-                    NowaitClauseOps>;
-
-// TODO `affinity`, `detach` clauses.
-using TaskOperands =
-    detail::Clauses<AllocateClauseOps, DependClauseOps, FinalClauseOps,
-                    IfClauseOps, InReductionClauseOps, MergeableClauseOps,
-                    PriorityClauseOps, PrivateClauseOps, UntiedClauseOps>;
-
-using TaskgroupOperands =
-    detail::Clauses<AllocateClauseOps, TaskReductionClauseOps>;
-
-using TaskloopOperands =
-    detail::Clauses<AllocateClauseOps, FinalClauseOps, GrainsizeClauseOps,
-                    IfClauseOps, InReductionClauseOps, MergeableClauseOps,
-                    NogroupClauseOps, NumTasksClauseOps, PriorityClauseOps,
-                    PrivateClauseOps, ReductionClauseOps, UntiedClauseOps>;
-
-using TaskwaitOperands = detail::Clauses<DependClauseOps, NowaitClauseOps>;
-
-using TeamsOperands =
-    detail::Clauses<AllocateClauseOps, IfClauseOps, NumTeamsClauseOps,
-                    PrivateClauseOps, ReductionClauseOps, ThreadLimitClauseOps>;
-
-using WsloopOperands =
-    detail::Clauses<AllocateClauseOps, LinearClauseOps, NowaitClauseOps,
-                    OrderClauseOps, OrderedClauseOps, PrivateClauseOps,
-                    ReductionClauseOps, ScheduleClauseOps>;
+/// omp.target_enter_data, omp.target_exit_data and omp.target_update take the
+/// same clauses, so we give the structure to be shared by all of them a
+/// representative name.
+using TargetEnterExitUpdateDataOperands = TargetEnterDataOperands;
 
 } // namespace omp
 } // namespace mlir
diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td
index 09eab50f53a54..0a1521f8ddfb8 100644
--- a/mlir/include/mlir/IR/CommonTypeConstraints.td
+++ b/mlir/include/mlir/IR/CommonTypeConstraints.td
@@ -198,7 +198,10 @@ class AllOfType<list<Type> allowedTypeList, string summary = "",
 class ConfinedType<Type type, list<Pred> predicates, string summary = "",
                    string cppType = type.cppType> : Type<
     And<!listconcat([type.predicate], !foreach(pred, predicates, pred))>,
-    summary, cppType>;
+    summary, cppType> {
+  Type baseType = type;
+  list<Pred> predicateList = predicates;
+}
 
 // Integer types.
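The two fields added to `ConfinedType` above expose the wrapped type and its
predicate list to other TableGen code. A minimal usage sketch follows; the
`SmallIntPred` and `MyDialect_SmallInt` names are hypothetical illustrations,
not definitions from this patch:

  def SmallIntPred
      : CPred<"::llvm::cast<::mlir::IntegerType>($_self).getWidth() <= 16">;
  def MyDialect_SmallInt
      : ConfinedType<AnyInteger, [SmallIntPred], "integer of at most 16 bits">;
  // `MyDialect_SmallInt.baseType` is AnyInteger and
  // `MyDialect_SmallInt.predicateList` is [SmallIntPred], so other constraints
  // and backends can reuse those pieces instead of re-deriving them.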
diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp index b4049bd7972d4..bfdd4a520af27 100644 --- a/mlir/lib/Bindings/Python/IRAttributes.cpp +++ b/mlir/lib/Bindings/Python/IRAttributes.cpp @@ -147,6 +147,26 @@ class PyAffineMapAttribute : public PyConcreteAttribute { } }; +class PyIntegerSetAttribute + : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAIntegerSet; + static constexpr const char *pyClassName = "IntegerSetAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + static constexpr GetTypeIDFunctionTy getTypeIdFunction = + mlirIntegerSetAttrGetTypeID; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](PyIntegerSet &integerSet) { + MlirAttribute attr = mlirIntegerSetAttrGet(integerSet.get()); + return PyIntegerSetAttribute(integerSet.getContext(), attr); + }, + py::arg("integer_set"), "Gets an attribute wrapping an IntegerSet."); + } +}; + template static T pyTryCast(py::handle object) { try { @@ -1426,7 +1446,6 @@ py::object symbolRefOrFlatSymbolRefAttributeCaster(PyAttribute &pyAttribute) { void mlir::python::populateIRAttributes(py::module &m) { PyAffineMapAttribute::bind(m); - PyDenseBoolArrayAttribute::bind(m); PyDenseBoolArrayAttribute::PyDenseArrayIterator::bind(m); PyDenseI8ArrayAttribute::bind(m); @@ -1466,6 +1485,7 @@ void mlir::python::populateIRAttributes(py::module &m) { PyOpaqueAttribute::bind(m); PyFloatAttribute::bind(m); PyIntegerAttribute::bind(m); + PyIntegerSetAttribute::bind(m); PyStringAttribute::bind(m); PyTypeAttribute::bind(m); PyGlobals::get().registerTypeCaster( diff --git a/mlir/lib/CAPI/IR/BuiltinAttributes.cpp b/mlir/lib/CAPI/IR/BuiltinAttributes.cpp index 726af884668b2..11d1ade552f5a 100644 --- a/mlir/lib/CAPI/IR/BuiltinAttributes.cpp +++ b/mlir/lib/CAPI/IR/BuiltinAttributes.cpp @@ -10,6 +10,7 @@ #include "mlir-c/Support.h" #include "mlir/CAPI/AffineMap.h" #include "mlir/CAPI/IR.h" +#include "mlir/CAPI/IntegerSet.h" #include "mlir/CAPI/Support.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/Attributes.h" @@ -192,6 +193,14 @@ MlirTypeID mlirIntegerSetAttrGetTypeID(void) { return wrap(IntegerSetAttr::getTypeID()); } +MlirAttribute mlirIntegerSetAttrGet(MlirIntegerSet set) { + return wrap(IntegerSetAttr::get(unwrap(set))); +} + +MlirIntegerSet mlirIntegerSetAttrGetValue(MlirAttribute attr) { + return wrap(llvm::cast(unwrap(attr)).getValue()); +} + //===----------------------------------------------------------------------===// // Opaque attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 53e18a2e9d299..687061e9988f8 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1104,7 +1104,10 @@ class VectorExtractOpConversion } // One-shot extraction of vector from array (only requires extractvalue). - if (isa(resultType)) { + // Except for extracting 1-element vectors. 
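+    // (When the position indexes every dimension of the source, the 1-element
+    // result vector cannot be produced by extractvalue alone; it is instead
+    // lowered below to a scalar llvm.extractelement followed by a cast back to
+    // the vector<1xT> type, as the extract_vec_1e_from_vec_1d_f32 tests check.)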
+ if (isa(resultType) && + position.size() != + static_cast(extractOp.getSourceVectorType().getRank())) { if (extractOp.hasDynamicPosition()) return failure(); diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp index 842d239cf6a51..4623b9667998c 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp @@ -19,6 +19,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Dialect/X86Vector/Transforms.h" @@ -45,6 +46,7 @@ struct ConvertVectorToLLVMPass registry.insert(); registry.insert(); registry.insert(); + registry.insert(); if (armNeon) registry.insert(); if (armSVE) diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp index d51d63f243ea0..85604eef2f283 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp @@ -465,9 +465,8 @@ bool AnalysisState::isValueRead(Value value) const { while (!workingSet.empty()) { OpOperand *uMaybeReading = workingSet.pop_back_val(); - if (visited.contains(uMaybeReading)) + if (!visited.insert(uMaybeReading).second) continue; - visited.insert(uMaybeReading); // Skip over all ops that neither read nor write (but create an alias). if (bufferizesToAliasOnly(*uMaybeReading)) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 76df3ecf2d2bd..630985d76a0eb 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -34,6 +34,7 @@ #include "mlir/IR/OperationSupport.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" @@ -1202,6 +1203,20 @@ void GenericOp::getEffects( getGenericEffectsImpl(effects, cast(getOperation())); } +static Speculation::Speculatability +getGenericSpeculatabilityImpl(LinalgOp linalgOp) { + // Operands with value semantics are speculatable, while operands with memory + // semantics are not. + if (!linalgOp.hasPureTensorSemantics()) + return Speculation::NotSpeculatable; + // The body of the op can still have speculation in its region. 
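+  // RecursivelySpeculatable defers the decision to the ops nested in the
+  // body: the op is speculatable only if all of them are (e.g. a body
+  // containing arith.divui is not, as the LICM tests in this patch check).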
+ return Speculation::RecursivelySpeculatable; +} + +Speculation::Speculatability GenericOp::getSpeculatability() { + return getGenericSpeculatabilityImpl(cast(getOperation())); +} + LogicalResult GenericOp::verify() { return success(); } namespace { @@ -1553,6 +1568,10 @@ void MapOp::getEffects( getGenericEffectsImpl(effects, cast(getOperation())); } +Speculation::Speculatability MapOp::getSpeculatability() { + return getGenericSpeculatabilityImpl(cast(getOperation())); +} + //===----------------------------------------------------------------------===// // ReduceOp //===----------------------------------------------------------------------===// @@ -1621,6 +1640,10 @@ void ReduceOp::getEffects( getGenericEffectsImpl(effects, cast(getOperation())); } +Speculation::Speculatability ReduceOp::getSpeculatability() { + return getGenericSpeculatabilityImpl(cast(getOperation())); +} + static ParseResult parseDenseI64ArrayAttr(OpAsmParser &parser, NamedAttrList &attributes, StringRef attributeName) { @@ -1906,6 +1929,10 @@ void TransposeOp::getEffects( getGenericEffectsImpl(effects, cast(getOperation())); } +Speculation::Speculatability TransposeOp::getSpeculatability() { + return getGenericSpeculatabilityImpl(cast(getOperation())); +} + LogicalResult TransposeOp::fold(FoldAdaptor adaptor, SmallVectorImpl &result) { // Only the tensor type is supported. @@ -2134,6 +2161,10 @@ void BroadcastOp::getEffects( getGenericEffectsImpl(effects, cast(getOperation())); } +Speculation::Speculatability BroadcastOp::getSpeculatability() { + return getGenericSpeculatabilityImpl(cast(getOperation())); +} + void BroadcastOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { results.add>(context); diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 1a9b87f0d68c9..e4ed58f26016a 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1370,7 +1370,7 @@ static LogicalResult verifyMapClause(Operation *op, OperandRange mapVars) { void TargetDataOp::build(OpBuilder &builder, OperationState &state, const TargetDataOperands &clauses) { - TargetDataOp::build(builder, state, clauses.device, clauses.ifVar, + TargetDataOp::build(builder, state, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.useDeviceAddrVars, clauses.useDevicePtrVars); } @@ -1395,7 +1395,7 @@ void TargetEnterDataOp::build( MLIRContext *ctx = builder.getContext(); TargetEnterDataOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds), - clauses.dependVars, clauses.device, clauses.ifVar, + clauses.dependVars, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.nowait); } @@ -1415,7 +1415,7 @@ void TargetExitDataOp::build(OpBuilder &builder, OperationState &state, MLIRContext *ctx = builder.getContext(); TargetExitDataOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds), - clauses.dependVars, clauses.device, clauses.ifVar, + clauses.dependVars, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.nowait); } @@ -1434,7 +1434,7 @@ void TargetUpdateOp::build(OpBuilder &builder, OperationState &state, const TargetEnterExitUpdateDataOperands &clauses) { MLIRContext *ctx = builder.getContext(); TargetUpdateOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds), - clauses.dependVars, clauses.device, clauses.ifVar, + clauses.dependVars, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.nowait); } @@ -1456,7 +1456,7 @@ void TargetOp::build(OpBuilder &builder, 
OperationState &state, // inReductionByref, inReductionSyms. TargetOp::build(builder, state, /*allocate_vars=*/{}, /*allocator_vars=*/{}, makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars, - clauses.device, clauses.hasDeviceAddrVars, clauses.ifVar, + clauses.device, clauses.hasDeviceAddrVars, clauses.ifExpr, /*in_reduction_vars=*/{}, /*in_reduction_byref=*/nullptr, /*in_reduction_syms=*/nullptr, clauses.isDevicePtrVars, clauses.mapVars, clauses.nowait, clauses.privateVars, @@ -1488,9 +1488,8 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state, void ParallelOp::build(OpBuilder &builder, OperationState &state, const ParallelOperands &clauses) { MLIRContext *ctx = builder.getContext(); - ParallelOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, - clauses.ifVar, clauses.numThreads, clauses.privateVars, + clauses.ifExpr, clauses.numThreads, clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), clauses.procBindKind, clauses.reductionVars, makeDenseBoolArrayAttr(ctx, clauses.reductionByref), @@ -1588,13 +1587,12 @@ void TeamsOp::build(OpBuilder &builder, OperationState &state, const TeamsOperands &clauses) { MLIRContext *ctx = builder.getContext(); // TODO Store clauses in op: privateVars, privateSyms. - TeamsOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, - clauses.ifVar, clauses.numTeamsLower, clauses.numTeamsUpper, - /*private_vars=*/{}, - /*private_syms=*/nullptr, clauses.reductionVars, - makeDenseBoolArrayAttr(ctx, clauses.reductionByref), - makeArrayAttr(ctx, clauses.reductionSyms), - clauses.threadLimit); + TeamsOp::build( + builder, state, clauses.allocateVars, clauses.allocatorVars, + clauses.ifExpr, clauses.numTeamsLower, clauses.numTeamsUpper, + /*private_vars=*/{}, /*private_syms=*/nullptr, clauses.reductionVars, + makeDenseBoolArrayAttr(ctx, clauses.reductionByref), + makeArrayAttr(ctx, clauses.reductionSyms), clauses.threadLimit); } LogicalResult TeamsOp::verify() { @@ -1814,7 +1812,7 @@ void SimdOp::build(OpBuilder &builder, OperationState &state, // TODO Store clauses in op: linearVars, linearStepVars, privateVars, // privateSyms, reductionVars, reductionByref, reductionSyms. SimdOp::build(builder, state, clauses.alignedVars, - makeArrayAttr(ctx, clauses.alignments), clauses.ifVar, + makeArrayAttr(ctx, clauses.alignments), clauses.ifExpr, /*linear_vars=*/{}, /*linear_step_vars=*/{}, clauses.nontemporalVars, clauses.order, clauses.orderMod, /*private_vars=*/{}, /*private_syms=*/nullptr, @@ -1996,7 +1994,7 @@ void TaskOp::build(OpBuilder &builder, OperationState &state, // TODO Store clauses in op: privateVars, privateSyms. TaskOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars, - clauses.final, clauses.ifVar, clauses.inReductionVars, + clauses.final, clauses.ifExpr, clauses.inReductionVars, makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable, clauses.priority, /*private_vars=*/{}, /*private_syms=*/nullptr, @@ -2042,7 +2040,7 @@ void TaskloopOp::build(OpBuilder &builder, OperationState &state, // TODO Store clauses in op: privateVars, privateSyms. 
TaskloopOp::build( builder, state, clauses.allocateVars, clauses.allocatorVars, - clauses.final, clauses.grainsize, clauses.ifVar, clauses.inReductionVars, + clauses.final, clauses.grainsize, clauses.ifExpr, clauses.inReductionVars, makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable, clauses.nogroup, clauses.numTasks, clauses.priority, /*private_vars=*/{}, @@ -2424,7 +2422,7 @@ LogicalResult AtomicCaptureOp::verifyRegions() { void CancelOp::build(OpBuilder &builder, OperationState &state, const CancelOperands &clauses) { - CancelOp::build(builder, state, clauses.cancelDirective, clauses.ifVar); + CancelOp::build(builder, state, clauses.cancelDirective, clauses.ifExpr); } LogicalResult CancelOp::verify() { diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp index 6420c192b257d..505e84e3ca0cf 100644 --- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp +++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp @@ -605,9 +605,8 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound( worklist.push_back(v); while (!worklist.empty()) { Value next = worklist.pop_back_val(); - if (visited.contains(next)) + if (!visited.insert(next).second) continue; - visited.insert(next); if (llvm::is_contained(independencies, next)) return false; // TODO: DominanceInfo could be used to stop the traversal early. diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 450e66f0db4e7..f288c7fc2cb77 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -2486,9 +2486,8 @@ static void eraseDeadUnrealizedCasts( // Do not visit ops multiple times. If we find a circle, no live user was // found on the current path. - if (visited.contains(op)) + if (!visited.insert(op).second) return false; - visited.insert(op); // Visit all users. for (Operation *user : op->getUsers()) { diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index 7b4fac7275bfc..a3d3a92618696 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -138,6 +138,7 @@ __all__ = [ "InsertionPoint", "IntegerAttr", "IntegerSet", + "IntegerSetAttr", "IntegerSetConstraint", "IntegerSetConstraintList", "IntegerType", @@ -1905,6 +1906,21 @@ class IntegerSet: @property def n_symbols(self) -> int: ... +class IntegerSetAttr(Attribute): + static_typeid: ClassVar[TypeID] + @staticmethod + def get(integer_set) -> IntegerSetAttr: + """ + Gets an attribute wrapping an IntegerSet. + """ + @staticmethod + def isinstance(other: Attribute) -> bool: ... + def __init__(self, cast_from_attr: Attribute) -> None: ... + @property + def type(self) -> Type: ... + @property + def typeid(self) -> TypeID: ... + class IntegerSetConstraint: def __init__(self, *args, **kwargs) -> None: ... 
@property diff --git a/mlir/python/mlir/ir.py b/mlir/python/mlir/ir.py index a9ac765fe1c17..9a6ce462047ad 100644 --- a/mlir/python/mlir/ir.py +++ b/mlir/python/mlir/ir.py @@ -22,6 +22,11 @@ def _affineMapAttr(x, context): return AffineMapAttr.get(x) +@register_attribute_builder("IntegerSetAttr") +def _integerSetAttr(x, context): + return IntegerSetAttr.get(x) + + @register_attribute_builder("BoolAttr") def _boolAttr(x, context): return BoolAttr.get(x, context=context) diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index 7ac49c5f02347..2fe9ba8fead17 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -1130,6 +1130,30 @@ func.func @extract_scalar_from_vec_1d_f32_scalable(%arg0: vector<[16]xf32>) -> f // ----- +func.func @extract_vec_1e_from_vec_1d_f32(%arg0: vector<16xf32>) -> vector<1xf32> { + %0 = vector.extract %arg0[15]: vector<1xf32> from vector<16xf32> + return %0 : vector<1xf32> +} +// CHECK-LABEL: @extract_vec_1e_from_vec_1d_f32( +// CHECK-SAME: %[[A:.*]]: vector<16xf32>) +// CHECK: %[[T0:.*]] = llvm.mlir.constant(15 : i64) : i64 +// CHECK: %[[T1:.*]] = llvm.extractelement %[[A]][%[[T0]] : i64] : vector<16xf32> +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : f32 to vector<1xf32> +// CHECK: return %[[T2]] : vector<1xf32> + +func.func @extract_vec_1e_from_vec_1d_f32_scalable(%arg0: vector<[16]xf32>) -> vector<1xf32> { + %0 = vector.extract %arg0[15]: vector<1xf32> from vector<[16]xf32> + return %0 : vector<1xf32> +} +// CHECK-LABEL: @extract_vec_1e_from_vec_1d_f32_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[16]xf32>) +// CHECK: %[[T0:.*]] = llvm.mlir.constant(15 : i64) : i64 +// CHECK: %[[T1:.*]] = llvm.extractelement %[[A]][%[[T0]] : i64] : vector<[16]xf32> +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : f32 to vector<1xf32> +// CHECK: return %[[T2]] : vector<1xf32> + +// ----- + func.func @extract_scalar_from_vec_1d_index(%arg0: vector<16xindex>) -> index { %0 = vector.extract %arg0[15]: index from vector<16xindex> return %0 : index @@ -2521,6 +2545,16 @@ func.func @transfer_write_1d_scalable_mask(%arg0: memref<1x?xf32>, %vec: vector< // ----- +// CHECK-LABEL: func @transfer_write_tensor +// CHECK: vector.transfer_write +func.func @transfer_write_tensor(%arg0: vector<4xf32>,%arg1: tensor) -> tensor { + %c0 = arith.constant 0 : index + %0 = vector.transfer_write %arg0, %arg1[%c0] : vector<4xf32>, tensor + return %0 : tensor +} + +// ----- + func.func @genbool_0d_f() -> vector { %0 = vector.constant_mask [0] : vector return %0 : vector diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir index 47a49465e8a7c..57f4ece9c9f2a 100644 --- a/mlir/test/Transforms/loop-invariant-code-motion.mlir +++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir @@ -1118,3 +1118,94 @@ func.func @hoist_from_scf_while(%arg0: i32, %arg1: i32) -> i32 { } return %0 : i32 } + +// ----- + +#trait = { + indexing_maps = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (k, n)>, + affine_map<(m, n, k) -> (m, n)> + ], + iterator_types = ["parallel", "parallel", "reduction"] +} + +// CHECK-LABEL: func @hoist_linalg_ops +// CHECK: linalg.generic +// CHECK: scf.for +// CHECK-NOT: linalg.generic +// CHECK: tensor.insert_slice +// CHECK: scf.yield +func.func @hoist_linalg_ops(%a : tensor<128x128xf32>, + %b : tensor<128x128xf32>, + %c: 
tensor<128x128xf32>, + %lb : index, + %ub : index, + %step : index, + %output : tensor) -> tensor { + %final = + scf.for %i = %lb to %ub step %step iter_args(%acc = %output) + -> tensor { + %compute = linalg.generic #trait + ins(%a, %b : tensor<128x128xf32>, tensor<128x128xf32>) + outs(%c : tensor<128x128xf32>) { + ^bb0(%in : f32, %in2 : f32, %in3 : f32): + %mul = arith.mulf %in, %in2 : f32 + %add = arith.addf %mul, %in3 : f32 + linalg.yield %in3 : f32 + } -> tensor<128x128xf32> + + %newacc = tensor.insert_slice %compute into + %output[%i, 0][128, 128][1, 1] + : tensor<128x128xf32> into tensor + scf.yield %newacc : tensor + } + + func.return %final : tensor +} + +// ----- + +#trait = { + indexing_maps = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (k, n)>, + affine_map<(m, n, k) -> (m, n)> + ], + iterator_types = ["parallel", "parallel", "reduction"] +} + +// CHECK-LABEL: func @hoist_linalg_ops_div_by_zero +// CHECK-NOT: linalg.generic +// CHECK: scf.for +// CHECK: linalg.generic +// CHECK: tensor.insert_slice +// CHECK: scf.yield +func.func @hoist_linalg_ops_div_by_zero(%a : tensor<128x128xi32>, + %b : tensor<128x128xi32>, + %c: tensor<128x128xi32>, + %lb : index, + %ub : index, + %step : index, + %output : tensor) -> tensor { + %cst0 = arith.constant 0 : i32 + %final = + scf.for %i = %lb to %ub step %step iter_args(%acc = %output) + -> tensor { + %compute = linalg.generic #trait + ins(%a, %b : tensor<128x128xi32>, tensor<128x128xi32>) + outs(%c : tensor<128x128xi32>) { + ^bb0(%in : i32, %in2 : i32, %in3 : i32): + %div = arith.divui %in, %in2 : i32 + %add = arith.addi %div, %in3 : i32 + linalg.yield %in3 : i32 + } -> tensor<128x128xi32> + + %newacc = tensor.insert_slice %compute into + %output[%i, 0][128, 128][1, 1] + : tensor<128x128xi32> into tensor + scf.yield %newacc : tensor + } + + func.return %final : tensor +} diff --git a/mlir/test/mlir-tblgen/openmp-clause-ops.td b/mlir/test/mlir-tblgen/openmp-clause-ops.td new file mode 100644 index 0000000000000..cee3f2a693bf8 --- /dev/null +++ b/mlir/test/mlir-tblgen/openmp-clause-ops.td @@ -0,0 +1,86 @@ +// Tablegen tests for the automatic generation of OpenMP clause operand +// structure definitions. + +// Run tablegen to generate OmpCommon.td in temp directory first. +// RUN: mkdir -p %t/mlir/Dialect/OpenMP +// RUN: mlir-tblgen --gen-directive-decl --directives-dialect=OpenMP \ +// RUN: %S/../../../llvm/include/llvm/Frontend/OpenMP/OMP.td \ +// RUN: -I %S/../../../llvm/include > %t/mlir/Dialect/OpenMP/OmpCommon.td + +// RUN: mlir-tblgen -gen-openmp-clause-ops -I %S/../../include -I %t %s 2>&1 | FileCheck %s + +include "mlir/Dialect/OpenMP/OpenMPOpBase.td" + + +def OpenMP_MyFirstClause : OpenMP_Clause< + /*isRequired=*/false, /*skipTraits=*/false, /*skipArguments=*/false, + /*skipAssemblyFormat=*/false, /*skipDescription=*/false, + /*skipExtraClassDeclaration=*/false> { + let arguments = (ins + // Simple attributes + I32Attr:$int_attr, + TypeAttr:$type_attr, + DeclareTargetAttr:$omp_attr, + + // Array attributes + F32ArrayAttr:$float_array_attr, + StrArrayAttr:$str_array_attr, + AnyIntElementsAttr:$anyint_elems_attr, + RankedF32ElementsAttr<[3, 4, 5]>:$float_nd_elems_attr, + + // Optional attributes + OptionalAttr:$opt_bool_attr, + OptionalAttr:$opt_int_array_attr, + OptionalAttr:$opt_int_elems_attr, + + // Multi-level composition + ConfinedAttr, [IntMinValue<0>]>:$complex_opt_int_attr, + + // ElementsAttrBase-related edge cases. 
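+    // A bare ElementsAttr (and StringElementsAttr) does not pin down an
+    // element type, so the generator cannot choose a SmallVector element type;
+    // it falls back to the attribute's bare storageType and emits the
+    // warnings checked below.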
+ // CHECK: warning: could not infer array-like attribute element type for argument 'elements_attr', will use bare `storageType` + ElementsAttr:$elements_attr, + // CHECK: warning: could not infer array-like attribute element type for argument 'string_elements_attr', will use bare `storageType` + StringElementsAttr:$string_elements_attr + ); +} +// CHECK: struct MyFirstClauseOps { +// CHECK-NEXT: ::mlir::IntegerAttr intAttr; +// CHECK-NEXT: ::mlir::TypeAttr typeAttr; +// CHECK-NEXT: ::mlir::omp::DeclareTargetAttr ompAttr; + +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Attribute> floatArrayAttr; +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Attribute> strArrayAttr; +// CHECK-NEXT: ::llvm::SmallVector<::llvm::APInt> anyintElemsAttr; +// CHECK-NEXT: ::llvm::SmallVector<::llvm::APFloat> floatNdElemsAttr; + +// CHECK-NEXT: ::mlir::BoolAttr optBoolAttr; +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Attribute> optIntArrayAttr; +// CHECK-NEXT: ::llvm::SmallVector optIntElemsAttr; + +// CHECK-NEXT: ::mlir::IntegerAttr complexOptIntAttr; + +// CHECK-NEXT: ::mlir::ElementsAttr elementsAttr; +// CHECK-NEXT: ::mlir::DenseElementsAttr stringElementsAttr; +// CHECK-NEXT: } + +def OpenMP_MySecondClause : OpenMP_Clause< + /*isRequired=*/false, /*skipTraits=*/false, /*skipArguments=*/false, + /*skipAssemblyFormat=*/false, /*skipDescription=*/false, + /*skipExtraClassDeclaration=*/false> { + let arguments = (ins + I32:$int_val, + Optional:$opt_any_val, + Variadic:$variadic_index_val + ); +} +// CHECK: struct MySecondClauseOps { +// CHECK-NEXT: ::mlir::Value intVal; +// CHECK-NEXT: ::mlir::Value optAnyVal; +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Value> variadicIndexVal; +// CHECK-NEXT: } + +def OpenMP_MyFirstOp : OpenMP_Op<"op", clauses=[OpenMP_MyFirstClause]>; +// CHECK: using MyFirstOperands = detail::Clauses; + +def OpenMP_MySecondOp : OpenMP_Op<"op", clauses=[OpenMP_MyFirstClause, OpenMP_MySecondClause]>; +// CHECK: using MySecondOperands = detail::Clauses; diff --git a/mlir/test/python/ir/attributes.py b/mlir/test/python/ir/attributes.py index 4b475db634645..00c3e1b4decdb 100644 --- a/mlir/test/python/ir/attributes.py +++ b/mlir/test/python/ir/attributes.py @@ -162,6 +162,24 @@ def testAffineMapAttr(): assert attr_built == attr_parsed +# CHECK-LABEL: TEST: testIntegerSetAttr +@run +def testIntegerSetAttr(): + with Context() as ctx: + d0 = AffineDimExpr.get(0) + d1 = AffineDimExpr.get(1) + s0 = AffineSymbolExpr.get(0) + c42 = AffineConstantExpr.get(42) + set0 = IntegerSet.get(2, 1, [d0 - d1, s0 - c42], [True, False]) + + # CHECK: affine_set<(d0, d1)[s0] : (d0 - d1 == 0, s0 - 42 >= 0)> + attr_built = IntegerSetAttr.get(set0) + print(str(attr_built)) + + attr_parsed = Attribute.parse(str(attr_built)) + assert attr_built == attr_parsed + + # CHECK-LABEL: TEST: testFloatAttr @run def testFloatAttr(): diff --git a/mlir/test/tblgen-to-irdl/CMathDialect.td b/mlir/test/tblgen-to-irdl/CMathDialect.td index 5b9e756727cb3..454543e074c48 100644 --- a/mlir/test/tblgen-to-irdl/CMathDialect.td +++ b/mlir/test/tblgen-to-irdl/CMathDialect.td @@ -25,7 +25,6 @@ def CMath_ComplexType : CMath_Type<"ComplexType", "complex"> { // CHECK: irdl.operation @identity { // CHECK-NEXT: %0 = irdl.base "!cmath.complex" -// CHECK-NEXT: irdl.operands() // CHECK-NEXT: irdl.results(%0) // CHECK-NEXT: } def CMath_IdentityOp : CMath_Op<"identity"> { diff --git a/mlir/test/tblgen-to-irdl/TestDialect.td b/mlir/test/tblgen-to-irdl/TestDialect.td index fc40da527db00..2622c81776076 100644 --- a/mlir/test/tblgen-to-irdl/TestDialect.td +++ 
b/mlir/test/tblgen-to-irdl/TestDialect.td @@ -28,9 +28,8 @@ def Test_AndOp : Test_Op<"and"> { // CHECK-LABEL: irdl.operation @and { // CHECK-NEXT: %[[v0:[^ ]*]] = irdl.base "!test.singleton_a" // CHECK-NEXT: %[[v1:[^ ]*]] = irdl.any -// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.all_of(%[[v0]], %[[v1]]) +// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.all_of(%[[v0]], %[[v1]]) // CHECK-NEXT: irdl.operands(%[[v2]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } @@ -41,9 +40,39 @@ def Test_AnyOp : Test_Op<"any"> { // CHECK-LABEL: irdl.operation @any { // CHECK-NEXT: %[[v0:[^ ]*]] = irdl.any // CHECK-NEXT: irdl.operands(%[[v0]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } +// Check confined types are converted correctly. +def Test_ConfinedOp : Test_Op<"confined"> { + let arguments = (ins ConfinedType($_self)">]>:$tensor, + ConfinedType($_self)"> + , CPred<"::llvm::cast<::mlir::VectorType>($_self).getRank() > 0">]>]>:$vector); +} +// CHECK-LABEL: irdl.operation @confined { +// CHECK-NEXT: %[[v0:[^ ]*]] = irdl.any +// CHECK-NEXT: %[[v1:[^ ]*]] = irdl.c_pred "(::llvm::isa<::mlir::TensorType>($_self))" +// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.all_of(%[[v0]], %[[v1]]) +// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any +// CHECK-NEXT: %[[v4:[^ ]*]] = irdl.c_pred "(::llvm::isa<::mlir::VectorType>($_self))" +// CHECK-NEXT: %[[v5:[^ ]*]] = irdl.c_pred "(::llvm::cast<::mlir::VectorType>($_self).getRank() > 0)" +// CHECK-NEXT: %[[v6:[^ ]*]] = irdl.all_of(%[[v4]], %[[v5]]) +// CHECK-NEXT: %[[v7:[^ ]*]] = irdl.all_of(%[[v3]], %[[v6]]) +// CHECK-NEXT: irdl.operands(%[[v2]], %[[v7]]) +// CHECK-NEXT: } + +// Check generic integer types are converted correctly. +def Test_Integers : Test_Op<"integers"> { + let arguments = (ins AnyI8:$any_int, + AnyInteger:$any_integer); +} +// CHECK-LABEL: irdl.operation @integers { +// CHECK-NEXT: %[[v0:[^ ]*]] = irdl.is i8 +// CHECK-NEXT: %[[v1:[^ ]*]] = irdl.is si8 +// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.is ui8 +// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any_of(%[[v0]], %[[v1]], %[[v2]]) +// CHECK-NEXT: %[[v4:[^ ]*]] = irdl.base "!builtin.integer" +// CHECK-NEXT: irdl.operands(%[[v3]], %[[v4]]) +// CHECK-NEXT: } // Check that AnyTypeOf is converted correctly. def Test_OrOp : Test_Op<"or"> { @@ -53,11 +82,30 @@ def Test_OrOp : Test_Op<"or"> { // CHECK-NEXT: %[[v0:[^ ]*]] = irdl.base "!test.singleton_a" // CHECK-NEXT: %[[v1:[^ ]*]] = irdl.base "!test.singleton_b" // CHECK-NEXT: %[[v2:[^ ]*]] = irdl.base "!test.singleton_c" -// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any_of(%[[v0]], %[[v1]], %[[v2]]) +// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any_of(%[[v0]], %[[v1]], %[[v2]]) // CHECK-NEXT: irdl.operands(%[[v3]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } +// Check that various types are converted correctly. +def Test_TypesOp : Test_Op<"types"> { + let arguments = (ins I32:$a, + SI64:$b, + UI8:$c, + Index:$d, + F32:$e, + NoneType:$f, + Complex); +} +// CHECK-LABEL: irdl.operation @types { +// CHECK-NEXT: %{{.*}} = irdl.is i32 +// CHECK-NEXT: %{{.*}} = irdl.is si64 +// CHECK-NEXT: %{{.*}} = irdl.is ui8 +// CHECK-NEXT: %{{.*}} = irdl.is index +// CHECK-NEXT: %{{.*}} = irdl.is f32 +// CHECK-NEXT: %{{.*}} = irdl.is none +// CHECK-NEXT: %{{.*}} = irdl.is complex +// CHECK-NEXT: irdl.operands({{.*}}) +// CHECK-NEXT: } // Check that variadics and optionals are converted correctly. 
def Test_VariadicityOp : Test_Op<"variadicity"> { @@ -70,5 +118,4 @@ def Test_VariadicityOp : Test_Op<"variadicity"> { // CHECK-NEXT: %[[v1:[^ ]*]] = irdl.base "!test.singleton_b" // CHECK-NEXT: %[[v2:[^ ]*]] = irdl.base "!test.singleton_c" // CHECK-NEXT: irdl.operands(variadic %[[v0]], optional %[[v1]], %[[v2]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp index a00f12661f712..7d42c03469dc9 100644 --- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp +++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp @@ -656,7 +656,7 @@ ArrayAttr {0}::getIndexingMaps() {{ } )FMT"; -// Implementations of fold and getEffects. +// Implementations of fold, getEffects and getSpeculatability. // Parameters: // {0}: Class name const char structuredOpFoldersFormat[] = R"FMT( @@ -669,6 +669,9 @@ void {0}::getEffects(SmallVectorImpl< if (hasPureTensorSemantics()) return; getGenericEffectsImpl(effects, cast(getOperation())); } +Speculation::Speculatability {0}::getSpeculatability() {{ + return getGenericSpeculatabilityImpl(cast(getOperation())); +} )FMT"; // Implementation of parse/print. diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index 1545821263788..23368c56bee8c 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -12,11 +12,54 @@ #include "mlir/TableGen/GenInfo.h" +#include "mlir/TableGen/CodeGenHelpers.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/FormatAdapters.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" using namespace llvm; +/// The code block defining the base mixin class for combining clause operand +/// structures. +static const char *const baseMixinClass = R"( +namespace detail { +template +struct Clauses : public Mixins... {}; +} // namespace detail +)"; + +/// The code block defining operation argument structures. +static const char *const operationArgStruct = R"( +using {0}Operands = detail::Clauses<{1}>; +)"; + +/// Remove multiple optional prefixes and suffixes from \c str. +/// +/// Prefixes and suffixes are attempted to be removed once in the order they +/// appear in the \c prefixes and \c suffixes arguments. All prefixes are +/// processed before suffixes are. This means it will behave as shown in the +/// following example: +/// - str: "PrePreNameSuf1Suf2" +/// - prefixes: ["Pre"] +/// - suffixes: ["Suf1", "Suf2"] +/// - return: "PreNameSuf1" +static StringRef stripPrefixAndSuffix(StringRef str, + llvm::ArrayRef prefixes, + llvm::ArrayRef suffixes) { + for (StringRef prefix : prefixes) + if (str.starts_with(prefix)) + str = str.drop_front(prefix.size()); + + for (StringRef suffix : suffixes) + if (str.ends_with(suffix)) + str = str.drop_back(suffix.size()); + + return str; +} + /// Obtain the name of the OpenMP clause a given record inheriting /// `OpenMP_Clause` refers to. /// @@ -53,19 +96,8 @@ static StringRef extractOmpClauseName(const Record *clause) { assert(!clauseClassName.empty() && "clause name must be found"); // Keep only the OpenMP clause name itself for reporting purposes. 
- StringRef prefix = "OpenMP_"; - StringRef suffixes[] = {"Skip", "Clause"}; - - if (clauseClassName.starts_with(prefix)) - clauseClassName = clauseClassName.substr(prefix.size()); - - for (StringRef suffix : suffixes) { - if (clauseClassName.ends_with(suffix)) - clauseClassName = - clauseClassName.substr(0, clauseClassName.size() - suffix.size()); - } - - return clauseClassName; + return stripPrefixAndSuffix(clauseClassName, /*prefixes=*/{"OpenMP_"}, + /*suffixes=*/{"Skip", "Clause"}); } /// Check that the given argument, identified by its name and initialization @@ -148,6 +180,139 @@ static void verifyClause(const Record *op, const Record *clause) { "or explicitly skipping this field."); } +/// Translate the type of an OpenMP clause's argument to its corresponding +/// representation for clause operand structures. +/// +/// All kinds of values are represented as `mlir::Value` fields, whereas +/// attributes are represented based on their `storageType`. +/// +/// \param[in] name The name of the argument. +/// \param[in] init The `DefInit` object representing the argument. +/// \param[out] nest Number of levels of array nesting associated with the +/// type. Must be initially set to 0. +/// \param[out] rank Rank (number of dimensions, if an array type) of the base +/// type. Must be initially set to 1. +/// +/// \return the name of the base type to represent elements of the argument +/// type. +static StringRef translateArgumentType(ArrayRef loc, StringInit *name, + Init *init, int &nest, int &rank) { + Record *def = cast(init)->getDef(); + + llvm::StringSet<> superClasses; + for (auto [sc, _] : def->getSuperClasses()) + superClasses.insert(sc->getNameInitAsString()); + + // Handle wrapper-style superclasses. + if (superClasses.contains("OptionalAttr")) + return translateArgumentType( + loc, name, def->getValue("baseAttr")->getValue(), nest, rank); + + if (superClasses.contains("TypedArrayAttrBase")) + return translateArgumentType( + loc, name, def->getValue("elementAttr")->getValue(), ++nest, rank); + + // Handle ElementsAttrBase superclasses. + if (superClasses.contains("ElementsAttrBase")) { + // TODO: Obtain the rank from ranked types. + ++nest; + + if (superClasses.contains("IntElementsAttrBase")) + return "::llvm::APInt"; + if (superClasses.contains("FloatElementsAttr") || + superClasses.contains("RankedFloatElementsAttr")) + return "::llvm::APFloat"; + if (superClasses.contains("DenseArrayAttrBase")) + return stripPrefixAndSuffix(def->getValueAsString("returnType"), + {"::llvm::ArrayRef<"}, {">"}); + + // Decrease the nesting depth in the case where the base type cannot be + // inferred, so that the bare storageType is used instead of a vector. + --nest; + PrintWarning( + loc, + "could not infer array-like attribute element type for argument '" + + name->getAsUnquotedString() + "', will use bare `storageType`"); + } + + // Handle simple attribute and value types. + [[maybe_unused]] bool isAttr = superClasses.contains("Attr"); + bool isValue = superClasses.contains("TypeConstraint"); + if (superClasses.contains("Variadic")) + ++nest; + + if (isValue) { + assert(!isAttr && + "argument can't be simultaneously a value and an attribute"); + return "::mlir::Value"; + } + + assert(isAttr && "argument must be an attribute if it's not a value"); + return nest > 0 ? "::mlir::Attribute" + : def->getValueAsString("storageType").trim(); +} + +/// Generate the structure that represents the arguments of the given \c clause +/// record of type \c OpenMP_Clause. 
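+///
+/// For instance (see the openmp-clause-ops.td test, whose argument names are
+/// test-only), `I32Attr:$int_attr` becomes a field
+/// `::mlir::IntegerAttr intAttr;` and `Variadic<Index>:$variadic_index_val`
+/// becomes `::llvm::SmallVector<::mlir::Value> variadicIndexVal;`.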
+/// +/// It will contain a field for each argument, using the same name translated to +/// camel case and the corresponding base type as returned by +/// translateArgumentType() optionally wrapped in one or more llvm::SmallVector. +/// +/// An additional field containing a tuple of integers to hold the size of each +/// dimension will also be created for multi-rank types. This is not yet +/// supported. +static void genClauseOpsStruct(const Record *clause, raw_ostream &os) { + if (clause->isAnonymous()) + return; + + StringRef clauseName = extractOmpClauseName(clause); + os << "struct " << clauseName << "ClauseOps {\n"; + + DagInit *arguments = clause->getValueAsDag("arguments"); + for (auto [name, arg] : + zip_equal(arguments->getArgNames(), arguments->getArgs())) { + int nest = 0, rank = 1; + StringRef baseType = + translateArgumentType(clause->getLoc(), name, arg, nest, rank); + std::string fieldName = + convertToCamelFromSnakeCase(name->getAsUnquotedString(), + /*capitalizeFirst=*/false); + + os << formatv(" {0}{1}{2} {3};\n", + fmt_repeat("::llvm::SmallVector<", nest), baseType, + fmt_repeat(">", nest), fieldName); + + if (rank > 1) { + assert(nest >= 1 && "must be nested if it's a ranked type"); + os << formatv(" {0}::std::tuple<{1}int>{2} {3}Dims;\n", + fmt_repeat("::llvm::SmallVector<", nest - 1), + fmt_repeat("int, ", rank - 1), fmt_repeat(">", nest - 1), + fieldName); + } + } + + os << "};\n"; +} + +/// Generate the structure that represents the clause-related arguments of the +/// given \c op record of type \c OpenMP_Op. +/// +/// This structure will be defined in terms of the clause operand structures +/// associated to the clauses of the operation. +static void genOperandsDef(const Record *op, raw_ostream &os) { + if (op->isAnonymous()) + return; + + SmallVector clauseNames; + for (Record *clause : op->getValueAsListOfDefs("clauseList")) + clauseNames.push_back((extractOmpClauseName(clause) + "ClauseOps").str()); + + StringRef opName = stripPrefixAndSuffix( + op->getName(), /*prefixes=*/{"OpenMP_"}, /*suffixes=*/{"Op"}); + os << formatv(operationArgStruct, opName, join(clauseNames, ", ")); +} + /// Verify that all properties of `OpenMP_Clause`s of records deriving from /// `OpenMP_Op`s have been inherited by the latter. static bool verifyDecls(const RecordKeeper &recordKeeper, raw_ostream &) { @@ -159,8 +324,32 @@ static bool verifyDecls(const RecordKeeper &recordKeeper, raw_ostream &) { return false; } +/// Generate structures to represent clause-related operands, based on existing +/// `OpenMP_Clause` definitions and aggregate them into operation-specific +/// structures according to the `clauses` argument of each definition deriving +/// from `OpenMP_Op`. +static bool genClauseOps(const RecordKeeper &recordKeeper, raw_ostream &os) { + mlir::tblgen::NamespaceEmitter ns(os, "mlir::omp"); + for (const Record *clause : + recordKeeper.getAllDerivedDefinitions("OpenMP_Clause")) + genClauseOpsStruct(clause, os); + + // Produce base mixin class. + os << baseMixinClass; + + for (const Record *op : recordKeeper.getAllDerivedDefinitions("OpenMP_Op")) + genOperandsDef(op, os); + + return false; +} + // Registers the generator to mlir-tblgen. 
static mlir::GenRegistration verifyOpenmpOps("verify-openmp-ops", "Verify OpenMP operations (produce no output file)", verifyDecls); + +static mlir::GenRegistration + genOpenmpClauseOps("gen-openmp-clause-ops", + "Generate OpenMP clause operand structures", + genClauseOps); diff --git a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp index 4a13a00335f65..dd0d98de496e8 100644 --- a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp +++ b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp @@ -39,6 +39,131 @@ llvm::cl::opt selectedDialect("dialect", llvm::cl::desc("The dialect to gen for"), llvm::cl::cat(dialectGenCat), llvm::cl::Required); +Value createPredicate(OpBuilder &builder, tblgen::Pred pred) { + MLIRContext *ctx = builder.getContext(); + + if (pred.isCombined()) { + auto combiner = pred.getDef().getValueAsDef("kind")->getName(); + if (combiner == "PredCombinerAnd" || combiner == "PredCombinerOr") { + std::vector constraints; + for (auto *child : pred.getDef().getValueAsListOfDefs("children")) { + constraints.push_back(createPredicate(builder, tblgen::Pred(child))); + } + if (combiner == "PredCombinerAnd") { + auto op = + builder.create(UnknownLoc::get(ctx), constraints); + return op.getOutput(); + } + auto op = + builder.create(UnknownLoc::get(ctx), constraints); + return op.getOutput(); + } + } + + std::string condition = pred.getCondition(); + // Build a CPredOp to match the C constraint built. + irdl::CPredOp op = builder.create( + UnknownLoc::get(ctx), StringAttr::get(ctx, condition)); + return op; +} + +Value typeToConstraint(OpBuilder &builder, Type type) { + MLIRContext *ctx = builder.getContext(); + auto op = + builder.create(UnknownLoc::get(ctx), TypeAttr::get(type)); + return op.getOutput(); +} + +std::optional recordToType(MLIRContext *ctx, const Record &predRec) { + + if (predRec.isSubClassOf("I")) { + auto width = predRec.getValueAsInt("bitwidth"); + return IntegerType::get(ctx, width, IntegerType::Signless); + } + + if (predRec.isSubClassOf("SI")) { + auto width = predRec.getValueAsInt("bitwidth"); + return IntegerType::get(ctx, width, IntegerType::Signed); + } + + if (predRec.isSubClassOf("UI")) { + auto width = predRec.getValueAsInt("bitwidth"); + return IntegerType::get(ctx, width, IntegerType::Unsigned); + } + + // Index type + if (predRec.getName() == "Index") { + return IndexType::get(ctx); + } + + // Float types + if (predRec.isSubClassOf("F")) { + auto width = predRec.getValueAsInt("bitwidth"); + switch (width) { + case 16: + return FloatType::getF16(ctx); + case 32: + return FloatType::getF32(ctx); + case 64: + return FloatType::getF64(ctx); + case 80: + return FloatType::getF80(ctx); + case 128: + return FloatType::getF128(ctx); + } + } + + if (predRec.getName() == "NoneType") { + return NoneType::get(ctx); + } + + if (predRec.getName() == "BF16") { + return FloatType::getBF16(ctx); + } + + if (predRec.getName() == "TF32") { + return FloatType::getTF32(ctx); + } + + if (predRec.getName() == "F8E4M3FN") { + return FloatType::getFloat8E4M3FN(ctx); + } + + if (predRec.getName() == "F8E5M2") { + return FloatType::getFloat8E5M2(ctx); + } + + if (predRec.getName() == "F8E4M3") { + return FloatType::getFloat8E4M3(ctx); + } + + if (predRec.getName() == "F8E4M3FNUZ") { + return FloatType::getFloat8E4M3FNUZ(ctx); + } + + if (predRec.getName() == "F8E4M3B11FNUZ") { + return FloatType::getFloat8E4M3B11FNUZ(ctx); + } + + if (predRec.getName() == "F8E5M2FNUZ") { + return FloatType::getFloat8E5M2FNUZ(ctx); + } + + if 
(predRec.getName() == "F8E3M4") { + return FloatType::getFloat8E3M4(ctx); + } + + if (predRec.isSubClassOf("Complex")) { + const Record *elementRec = predRec.getValueAsDef("elementType"); + auto elementType = recordToType(ctx, *elementRec); + if (elementType.has_value()) { + return ComplexType::get(elementType.value()); + } + } + + return std::nullopt; +} + Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { MLIRContext *ctx = builder.getContext(); const Record &predRec = constraint.getDef(); @@ -78,11 +203,45 @@ Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { return op.getOutput(); } - std::string condition = constraint.getPredicate().getCondition(); - // Build a CPredOp to match the C constraint built. - irdl::CPredOp op = builder.create( - UnknownLoc::get(ctx), StringAttr::get(ctx, condition)); - return op; + // Integer types + if (predRec.getName() == "AnyInteger") { + auto op = builder.create( + UnknownLoc::get(ctx), StringAttr::get(ctx, "!builtin.integer")); + return op.getOutput(); + } + + if (predRec.isSubClassOf("AnyI")) { + auto width = predRec.getValueAsInt("bitwidth"); + std::vector types = { + typeToConstraint(builder, + IntegerType::get(ctx, width, IntegerType::Signless)), + typeToConstraint(builder, + IntegerType::get(ctx, width, IntegerType::Signed)), + typeToConstraint(builder, + IntegerType::get(ctx, width, IntegerType::Unsigned))}; + auto op = builder.create(UnknownLoc::get(ctx), types); + return op.getOutput(); + } + + auto type = recordToType(ctx, predRec); + + if (type.has_value()) { + return typeToConstraint(builder, type.value()); + } + + // Confined type + if (predRec.isSubClassOf("ConfinedType")) { + std::vector constraints; + constraints.push_back(createConstraint( + builder, tblgen::Constraint(predRec.getValueAsDef("baseType")))); + for (Record *child : predRec.getValueAsListOfDefs("predicateList")) { + constraints.push_back(createPredicate(builder, tblgen::Pred(child))); + } + auto op = builder.create(UnknownLoc::get(ctx), constraints); + return op.getOutput(); + } + + return createPredicate(builder, constraint.getPredicate()); } /// Returns the name of the operation without the dialect prefix. @@ -131,10 +290,12 @@ irdl::OperationOp createIRDLOperation(OpBuilder &builder, auto [results, resultVariadicity] = getValues(tblgenOp.getResults()); // Create the operands and results operations. 
-  consBuilder.create<irdl::OperandsOp>(UnknownLoc::get(ctx), operands,
-                                       operandVariadicity);
-  consBuilder.create<irdl::ResultsOp>(UnknownLoc::get(ctx), results,
-                                      resultVariadicity);
+  if (!operands.empty())
+    consBuilder.create<irdl::OperandsOp>(UnknownLoc::get(ctx), operands,
+                                         operandVariadicity);
+  if (!results.empty())
+    consBuilder.create<irdl::ResultsOp>(UnknownLoc::get(ctx), results,
+                                        resultVariadicity);

   return op;
 }
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 4df7954ea3440..2af3fb40507ed 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -3690,22 +3690,41 @@ cc_binary(
     ],
 )

-cc_binary(
-    name = "llvm-debuginfod-find",
+gentbl(
+    name = "DebugInfodFindOptsTableGen",
+    strip_include_prefix = "tools/llvm-debuginfod-find",
+    tbl_outs = [(
+        "-gen-opt-parser-defs",
+        "tools/llvm-debuginfod-find/Opts.inc",
+    )],
+    tblgen = ":llvm-tblgen",
+    td_file = "tools/llvm-debuginfod-find/Opts.td",
+    td_srcs = ["include/llvm/Option/OptParser.td"],
+)
+
+cc_library(
+    name = "llvm-debuginfod-find-lib",
     srcs = glob([
         "tools/llvm-debuginfod-find/*.cpp",
     ]),
     copts = llvm_copts,
-    stamp = 0,
     deps = [
         ":BitReader",
         ":Core",
+        ":DebugInfodFindOptsTableGen",
         ":Debuginfod",
+        ":Option",
         ":Support",
         ":Symbolize",
     ],
 )

+llvm_driver_cc_binary(
+    name = "llvm-debuginfod-find",
+    stamp = 0,
+    deps = [":llvm-debuginfod-find-lib"],
+)
+
 cc_binary(
     name = "llvm-dis",
     srcs = glob([
diff --git a/utils/bazel/llvm-project-overlay/llvm/driver.bzl b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
index b3d3b2eed9f06..66e8af7db7d0e 100644
--- a/utils/bazel/llvm-project-overlay/llvm/driver.bzl
+++ b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
@@ -16,6 +16,7 @@ _TOOLS = {
     "llvm-ar": "//llvm:llvm-ar-lib",
     "llvm-cgdata": "//llvm:llvm-cgdata-lib",
     "llvm-cxxfilt": "//llvm:llvm-cxxfilt-lib",
+    "llvm-debuginfod-find": "//llvm:llvm-debuginfod-find-lib",
     "llvm-dwp": "//llvm:llvm-dwp-lib",
     "llvm-gsymutil": "//llvm:llvm-gsymutil-lib",
     "llvm-ifs": "//llvm:llvm-ifs-lib",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 5ee0ee5108276..c931898ed98e3 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -10516,6 +10516,10 @@ gentbl_cc_library(
             ["-gen-op-doc"],
             "g3doc/Dialects/OpenMP/OpenMPOps.md",
         ),
+        (
+            ["-gen-openmp-clause-ops"],
+            "include/mlir/Dialect/OpenMP/OpenMPClauseOps.h.inc",
+        ),
     ],
     tblgen = ":mlir-tblgen",
     td_file = "include/mlir/Dialect/OpenMP/OpenMPOps.td",
@@ -11780,6 +11784,7 @@ cc_library(
         ":MaskableOpInterface",
         ":MemRefDialect",
         ":Pass",
+        ":TensorDialect",
         ":ToLLVMIRTranslation",
         ":TransformUtils",
         ":VectorDialect",