From 61372fc5db9b14fd612be8a58a76edd7f0ee38aa Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Tue, 10 Sep 2024 12:52:02 -0700 Subject: [PATCH 001/114] [HLSL] Warn on duplicate is_rov attribute; remove unnecessary parentheses (#107973) We should issue a warning whenever a duplicate resource type attribute is found. Currently we do that only for `resource_class`. This PR fixes that by checking for duplicate `is_rov` attributes as well. Also removes unnecessary parenthesis on `is_rov`. --- clang/lib/AST/TypePrinter.cpp | 2 +- clang/lib/Sema/SemaHLSL.cpp | 4 ++++ clang/lib/Sema/SemaType.cpp | 6 +++++- clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl | 6 +++--- clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl | 8 ++++---- clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl | 2 +- 6 files changed, 18 insertions(+), 10 deletions(-) diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index add6a5d10d61f..be627a6242eb4 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -2077,7 +2077,7 @@ void TypePrinter::printHLSLAttributedResourceAfter( << HLSLResourceClassAttr::ConvertResourceClassToStr(Attrs.ResourceClass) << ")]]"; if (Attrs.IsROV) - OS << " [[hlsl::is_rov()]]"; + OS << " [[hlsl::is_rov]]"; } void TypePrinter::printObjCInterfaceBefore(const ObjCInterfaceType *T, diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index f2158226e6ca7..4e44813fe515c 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -592,6 +592,10 @@ bool clang::CreateHLSLAttributedResourceType(Sema &S, QualType Wrapped, break; } case attr::HLSLROV: + if (ResAttrs.IsROV) { + S.Diag(A->getLocation(), diag::warn_duplicate_attribute_exact) << A; + return false; + } ResAttrs.IsROV = true; break; default: diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 520dce870b7b7..e627fee51b66b 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -8844,7 +8844,11 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, } case ParsedAttr::AT_HLSLResourceClass: case ParsedAttr::AT_HLSLROV: { - if (state.getSema().HLSL().handleResourceTypeAttr(attr)) + // Only collect HLSL resource type attributes that are in + // decl-specifier-seq; do not collect attributes on declarations or those + // that get to slide after declaration name. 
+ if (TAL == TAL_DeclSpec && + state.getSema().HLSL().handleResourceTypeAttr(attr)) attr.setUsedAsTypeAttr(); break; } diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl index 24c85c6ccf7d7..cf21ec4d380db 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov]]':'__hlsl_resource_t' struct MyBuffer { __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h; }; -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t {{\[\[}}hlsl::resource_class(SRV)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t {{\[\[}}hlsl::resource_class(SRV)]] {{\[\[}}hlsl::is_rov]]':'__hlsl_resource_t' __hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res; // CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:14:6 f 'void () -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t {{\[\[}}hlsl::resource_class(Sampler)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t {{\[\[}}hlsl::resource_class(Sampler)]] {{\[\[}}hlsl::is_rov]]':'__hlsl_resource_t' void f() { __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r; } diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl index 68b2d9ecb190a..15685bd1a3baa 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl @@ -1,10 +1,10 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify // expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}} -[[hlsl::is_rov()]] __hlsl_resource_t res0; +[[hlsl::is_rov]] __hlsl_resource_t res0; // expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}} -__hlsl_resource_t [[hlsl::is_rov()]] res1; +__hlsl_resource_t [[hlsl::is_rov]] res1; // expected-error@+1{{'is_rov' attribute takes no arguments}} __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; @@ -12,5 +12,5 @@ __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; // expected-error@+1{{use of undeclared identifier 'gibberish'}} __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3; -// duplicate attribute with the same meaning - no error -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov()]] [[hlsl::is_rov()]] res4; +// expected-warning@+1{{attribute 'is_rov' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] [[hlsl::is_rov]] res4; diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index 6324a11fc8a2d..7c3830a291970 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -11,6 +11,6 @@ RWBuffer Buffer1; // CHECK: -TemplateArgument type 'vector' // CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector' 4 // CHECK: 
`-BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'vector {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov()]]':'vector' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'vector {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov]]':'vector' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer RasterizerOrderedBuffer > BufferArray3[4]; From 0b12cd227e593f5518da5170a399730bb314223e Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Tue, 10 Sep 2024 13:02:15 -0700 Subject: [PATCH 002/114] [rtsan] Ensure pthread is initialized in test (#108040) --- .../lib/rtsan/tests/rtsan_test_interceptors.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp index 0eeaf9da67098..1ef4c66a28de8 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp @@ -472,11 +472,12 @@ TEST_F(PthreadMutexLockTest, PthreadMutexUnlockSurvivesWhenNotRealtime) { ExpectNonRealtimeSurvival(Func); } -TEST(TestRtsanInterceptors, PthreadMutexJoinDiesWhenRealtime) { - auto Func = []() { - pthread_t thread{}; - pthread_join(thread, nullptr); - }; +TEST(TestRtsanInterceptors, PthreadJoinDiesWhenRealtime) { + pthread_t thread{}; + ASSERT_EQ(0, + pthread_create(&thread, nullptr, &FakeThreadEntryPoint, nullptr)); + + auto Func = [&thread]() { pthread_join(thread, nullptr); }; ExpectRealtimeDeath(Func, "pthread_join"); ExpectNonRealtimeSurvival(Func); From b9703cb1a535b72f6f0812322225e50f9e325850 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 10 Sep 2024 20:04:01 +0000 Subject: [PATCH 003/114] [gn build] Port becb03f3c624 --- llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn index 28f250ad3b7ba..ff4f558ca2fcf 100644 --- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn @@ -110,6 +110,7 @@ static_library("CodeGen") { "Targets/AVR.cpp", "Targets/BPF.cpp", "Targets/CSKY.cpp", + "Targets/DirectX.cpp", "Targets/Hexagon.cpp", "Targets/Lanai.cpp", "Targets/LoongArch.cpp", From cb3eb068e6b008b4784a93ac181516ae69350bf1 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 10 Sep 2024 20:04:02 +0000 Subject: [PATCH 004/114] [gn build] Port f4e2d7bfc143 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn index 009aba221a0bc..e296a7b93c760 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn @@ -18,6 +18,7 @@ static_library("Coroutines") { "CoroFrame.cpp", "CoroSplit.cpp", "Coroutines.cpp", + "SpillUtils.cpp", "SuspendCrossingInfo.cpp", ] } From 6dacc382f5158b28550c25cd452848f4ab3ecd63 Mon Sep 17 00:00:00 2001 From: erichkeane Date: Tue, 10 Sep 2024 13:09:27 -0700 Subject: [PATCH 005/114] [OpenACC] Properly ignore side-effects in clause arguments The OpenACC standard makes depending on side effects to be effectively UB, so this patch ensures we handle them reaonably by making it a potentially evaluated context, and ignoring cleanups. 
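
For reference, the new AST test below boils down to this construct, in
which the clause argument creates a temporary that needs a cleanup;
parsing the argument in its own potentially evaluated context and then
discarding the cleanups keeps them from affecting the enclosing context:

  struct HasCtor { HasCtor(); operator int(); ~HasCtor(); };

  void useCtorType() {
    // OpenACC 3.3 2.1: a program must not depend on the order of
    // evaluation of expressions in clause arguments or on any side
    // effects of those evaluations.
    #pragma acc kernels num_workers(HasCtor{})
    while(true);
  }
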
--- clang/lib/Sema/SemaOpenACC.cpp | 12 ++++++++++ .../SemaOpenACC/compute-construct-ast.cpp | 23 ++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index cf207be33175c..e1fc9cea1eb2b 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -1210,6 +1210,10 @@ ExprResult SemaOpenACC::CheckReductionVar(Expr *VarExpr) { void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, SourceLocation DirLoc) { + // Start an evaluation context to parse the clause arguments on. + SemaRef.PushExpressionEvaluationContext( + Sema::ExpressionEvaluationContext::PotentiallyEvaluated); + switch (K) { case OpenACCDirectiveKind::Invalid: // Nothing to do here, an invalid kind has nothing we can check here. We @@ -1626,6 +1630,8 @@ ExprResult SemaOpenACC::ActOnArraySectionExpr(Expr *Base, SourceLocation LBLoc, bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K, SourceLocation StartLoc) { + SemaRef.DiscardCleanupsInEvaluationContext(); + SemaRef.PopExpressionEvaluationContext(); return diagnoseConstructAppertainment(*this, K, StartLoc, /*IsStmt=*/true); } @@ -1649,6 +1655,7 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K, ParentlessLoopConstructs); ParentlessLoopConstructs.clear(); + return ComputeConstruct; } case OpenACCDirectiveKind::Loop: { @@ -1704,6 +1711,11 @@ StmtResult SemaOpenACC::ActOnAssociatedStmt(SourceLocation DirectiveLoc, bool SemaOpenACC::ActOnStartDeclDirective(OpenACCDirectiveKind K, SourceLocation StartLoc) { + // OpenCC3.3 2.1 (line 889) + // A program must not depend on the order of evaluation of expressions in + // clause arguments or on any side effects of the evaluations. + SemaRef.DiscardCleanupsInEvaluationContext(); + SemaRef.PopExpressionEvaluationContext(); return diagnoseConstructAppertainment(*this, K, StartLoc, /*IsStmt=*/false); } diff --git a/clang/test/SemaOpenACC/compute-construct-ast.cpp b/clang/test/SemaOpenACC/compute-construct-ast.cpp index e632522f877b5..7a33aeb80570c 100644 --- a/clang/test/SemaOpenACC/compute-construct-ast.cpp +++ b/clang/test/SemaOpenACC/compute-construct-ast.cpp @@ -117,5 +117,26 @@ struct S { void use() { TemplFunc(); } -#endif +struct HasCtor { HasCtor(); operator int(); ~HasCtor();}; + +void useCtorType() { + // CHECK-LABEL: useCtorType + // CHECK-NEXT: CompoundStmt + +#pragma acc kernels num_workers(HasCtor{}) + // CHECK-NEXT: OpenACCComputeConstruct{{.*}} kernels + // CHECK-NEXT: num_workers clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'int' + // CHECK-NEXT: MemberExpr{{.*}}.operator int + // CHECK-NEXT: MaterializeTemporaryExpr{{.*}}'HasCtor' + // CHECK-NEXT: CXXBindTemporaryExpr{{.*}}'HasCtor' + // CHECK-NEXT: CXXTemporaryObjectExpr{{.*}}'HasCtor' + + while(true); + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt +} +#endif From 27a01f6b4c47baefc347e47e4d38ea26bb721b2d Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Tue, 10 Sep 2024 17:11:49 -0300 Subject: [PATCH 006/114] [clang] correct argument offset for function template partial ordering (#107972) --- clang/lib/Sema/SemaTemplateDeduction.cpp | 28 ++++++++++++------------ clang/test/SemaTemplate/GH18291.cpp | 27 ++++++++++++++++++++++- 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 4c88159ea4ced..562c57a41299a 100644 --- 
a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -5502,10 +5502,6 @@ static TemplateDeductionResult CheckDeductionConsistency( ArrayRef DeducedArgs, bool CheckConsistency) { MultiLevelTemplateArgumentList MLTAL(FTD, DeducedArgs, /*Final=*/true); - if (ArgIdx != -1) - if (auto *MD = dyn_cast(FTD->getTemplatedDecl()); - MD && MD->isImplicitObjectMemberFunction()) - ArgIdx -= 1; Sema::ArgumentPackSubstitutionIndexRAII PackIndex( S, ArgIdx != -1 ? ::getPackIndexForParam(S, FTD, MLTAL, ArgIdx) : -1); bool IsIncompleteSubstitution = false; @@ -5576,12 +5572,10 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( /// Determine whether the function template \p FT1 is at least as /// specialized as \p FT2. -static bool isAtLeastAsSpecializedAs(Sema &S, SourceLocation Loc, - FunctionTemplateDecl *FT1, - FunctionTemplateDecl *FT2, - TemplatePartialOrderingContext TPOC, - ArrayRef Args1, - ArrayRef Args2) { +static bool isAtLeastAsSpecializedAs( + Sema &S, SourceLocation Loc, FunctionTemplateDecl *FT1, + FunctionTemplateDecl *FT2, TemplatePartialOrderingContext TPOC, + ArrayRef Args1, ArrayRef Args2, bool Args1Offset) { FunctionDecl *FD1 = FT1->getTemplatedDecl(); FunctionDecl *FD2 = FT2->getTemplatedDecl(); const FunctionProtoType *Proto1 = FD1->getType()->getAs(); @@ -5676,6 +5670,8 @@ static bool isAtLeastAsSpecializedAs(Sema &S, SourceLocation Loc, TemplateDeductionInfo &Info, SmallVectorImpl &Deduced, PartialOrderingKind) { + if (ArgIdx != -1) + ArgIdx -= Args1Offset; return ::CheckDeductionConsistency( S, FTD, ArgIdx, P, A, DeducedArgs, /*CheckConsistency=*/HasDeducedParam[ParamIdx]); @@ -5763,6 +5759,8 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate( const FunctionDecl *FD2 = FT2->getTemplatedDecl(); bool ShouldConvert1 = false; bool ShouldConvert2 = false; + bool Args1Offset = false; + bool Args2Offset = false; QualType Obj1Ty; QualType Obj2Ty; if (TPOC == TPOC_Call) { @@ -5811,6 +5809,7 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate( Obj1Ty = GetImplicitObjectParameterType(this->Context, Method1, RawObj1Ty, IsRValRef2); Args1.push_back(Obj1Ty); + Args1Offset = true; } if (ShouldConvert2) { bool IsRValRef1 = @@ -5821,6 +5820,7 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate( Obj2Ty = GetImplicitObjectParameterType(this->Context, Method2, RawObj2Ty, IsRValRef1); Args2.push_back(Obj2Ty); + Args2Offset = true; } } else { if (NonStaticMethod1 && Method1->hasCXXExplicitFunctionObjectParameter()) @@ -5842,10 +5842,10 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate( } else { assert(!Reversed && "Only call context could have reversed arguments"); } - bool Better1 = - isAtLeastAsSpecializedAs(*this, Loc, FT1, FT2, TPOC, Args1, Args2); - bool Better2 = - isAtLeastAsSpecializedAs(*this, Loc, FT2, FT1, TPOC, Args2, Args1); + bool Better1 = isAtLeastAsSpecializedAs(*this, Loc, FT1, FT2, TPOC, Args1, + Args2, Args2Offset); + bool Better2 = isAtLeastAsSpecializedAs(*this, Loc, FT2, FT1, TPOC, Args2, + Args1, Args1Offset); // C++ [temp.deduct.partial]p10: // F is more specialized than G if F is at least as specialized as G and G // is not at least as specialized as F. 
diff --git a/clang/test/SemaTemplate/GH18291.cpp b/clang/test/SemaTemplate/GH18291.cpp index ca1e69e4ca3f5..820564ffa6f1a 100644 --- a/clang/test/SemaTemplate/GH18291.cpp +++ b/clang/test/SemaTemplate/GH18291.cpp @@ -86,4 +86,29 @@ namespace func_pointer { template void pow(_Tp, complex::type>) = delete; void (*ptr)(const complex &, complex){pow}; } // namespace param -} // namespace t3 +} // namespace func_pointer + +namespace static_vs_nonstatic { + namespace implicit_obj_param { + struct A { + template + static void f(int a, Args... args) {} + template + void f(Args... args) = delete; + }; + void g(){ + A::f(0); + } + } // namespace implicit_obj_param + namespace explicit_obj_param { + struct A { + template + static void f(int, Args... args) {} + template + void f(this A *, Args... args) = delete; + }; + void g(){ + A::f(0); + } + } // namespace explicit_obj_param +} // namespace static_vs_nonstatic From 3363760f9a00c5d4dac1e08d44f9d79b8e322511 Mon Sep 17 00:00:00 2001 From: vporpo Date: Tue, 10 Sep 2024 13:17:26 -0700 Subject: [PATCH 007/114] [SandboxIR] PassManager (#107932) This patch implements a simple pass manager for Sandbox IR. --- llvm/include/llvm/SandboxIR/Pass.h | 4 +- llvm/include/llvm/SandboxIR/PassManager.h | 70 +++++++++++++++++++++++ llvm/lib/SandboxIR/CMakeLists.txt | 1 + llvm/lib/SandboxIR/PassManager.cpp | 22 +++++++ llvm/unittests/SandboxIR/PassTest.cpp | 51 +++++++++++++++++ 5 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 llvm/include/llvm/SandboxIR/PassManager.h create mode 100644 llvm/lib/SandboxIR/PassManager.cpp diff --git a/llvm/include/llvm/SandboxIR/Pass.h b/llvm/include/llvm/SandboxIR/Pass.h index d659e96839213..caf1c70a84147 100644 --- a/llvm/include/llvm/SandboxIR/Pass.h +++ b/llvm/include/llvm/SandboxIR/Pass.h @@ -37,8 +37,8 @@ class Pass { Pass.print(OS); return OS; } - void print(raw_ostream &OS) const { OS << Name; } - LLVM_DUMP_METHOD void dump() const; + virtual void print(raw_ostream &OS) const { OS << Name; } + LLVM_DUMP_METHOD virtual void dump() const; #endif }; diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h new file mode 100644 index 0000000000000..cb321fe699a56 --- /dev/null +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -0,0 +1,70 @@ +//===- PassManager.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Registers and executes the Sandbox IR passes. +// +// The pass manager contains an ordered sequence of passes that it runs in +// order. The passes are owned by the PassRegistry, not by the PassManager. +// +// Note that in this design a pass manager is also a pass. So a pass manager +// runs when it is it's turn to run in its parent pass-manager pass pipeline. +// + +#ifndef LLVM_SANDBOXIR_PASSMANAGER_H +#define LLVM_SANDBOXIR_PASSMANAGER_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/SandboxIR/Pass.h" +#include "llvm/Support/Debug.h" + +namespace llvm::sandboxir { + +class Value; + +/// Base class. +template +class PassManager : public ParentPass { +protected: + /// The list of passes that this pass manager will run. 
+ SmallVector Passes; + + PassManager(StringRef Name) : ParentPass(Name) {} + PassManager(const PassManager &) = delete; + virtual ~PassManager() = default; + PassManager &operator=(const PassManager &) = delete; + +public: + /// Adds \p Pass to the pass pipeline. + void addPass(ContainedPass *Pass) { + // TODO: Check that Pass's class type works with this PassManager type. + Passes.push_back(Pass); + } +#ifndef NDEBUG + void print(raw_ostream &OS) const override { + OS << this->getName(); + OS << "("; + interleave(Passes, OS, [&OS](auto *Pass) { OS << Pass->getName(); }, ","); + OS << ")"; + } + LLVM_DUMP_METHOD void dump() const override { + print(dbgs()); + dbgs() << "\n"; + } +#endif +}; + +class FunctionPassManager final + : public PassManager { +public: + FunctionPassManager(StringRef Name) : PassManager(Name) {} + bool runOnFunction(Function &F) final; +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_SANDBOXIR_PASSMANAGER_H diff --git a/llvm/lib/SandboxIR/CMakeLists.txt b/llvm/lib/SandboxIR/CMakeLists.txt index 2f047944e0335..03474be0c7b80 100644 --- a/llvm/lib/SandboxIR/CMakeLists.txt +++ b/llvm/lib/SandboxIR/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_component_library(LLVMSandboxIR Pass.cpp + PassManager.cpp SandboxIR.cpp Tracker.cpp Type.cpp diff --git a/llvm/lib/SandboxIR/PassManager.cpp b/llvm/lib/SandboxIR/PassManager.cpp new file mode 100644 index 0000000000000..d10f3926f7bcd --- /dev/null +++ b/llvm/lib/SandboxIR/PassManager.cpp @@ -0,0 +1,22 @@ +//===- PassManager.cpp - Runs a pipeline of Sandbox IR passes -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/SandboxIR/PassManager.h" +#include "llvm/SandboxIR/SandboxIR.h" + +using namespace llvm::sandboxir; + +bool FunctionPassManager::runOnFunction(Function &F) { + bool Change = false; + for (FunctionPass *Pass : Passes) { + Change |= Pass->runOnFunction(F); + // TODO: run the verifier. + } + // TODO: Check ChangeAll against hashes before/after. 
+ return Change; +} diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp index 65992d8cb95ee..8e080128b15b3 100644 --- a/llvm/unittests/SandboxIR/PassTest.cpp +++ b/llvm/unittests/SandboxIR/PassTest.cpp @@ -9,6 +9,7 @@ #include "llvm/SandboxIR/Pass.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Module.h" +#include "llvm/SandboxIR/PassManager.h" #include "llvm/SandboxIR/SandboxIR.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" @@ -82,3 +83,53 @@ define void @foo() { EXPECT_DEATH(TestNamePass("-dash"), ".*start with.*"); #endif } + +TEST_F(PassTest, FunctionPassManager) { + auto *F = parseFunction(R"IR( +define void @foo() { + ret void +} +)IR", + "foo"); + class TestPass1 final : public FunctionPass { + unsigned &BBCnt; + + public: + TestPass1(unsigned &BBCnt) : FunctionPass("test-pass1"), BBCnt(BBCnt) {} + bool runOnFunction(Function &F) final { + for ([[maybe_unused]] auto &BB : F) + ++BBCnt; + return false; + } + }; + class TestPass2 final : public FunctionPass { + unsigned &BBCnt; + + public: + TestPass2(unsigned &BBCnt) : FunctionPass("test-pass2"), BBCnt(BBCnt) {} + bool runOnFunction(Function &F) final { + for ([[maybe_unused]] auto &BB : F) + ++BBCnt; + return false; + } + }; + unsigned BBCnt1 = 0; + unsigned BBCnt2 = 0; + TestPass1 TPass1(BBCnt1); + TestPass2 TPass2(BBCnt2); + + FunctionPassManager FPM("test-fpm"); + FPM.addPass(&TPass1); + FPM.addPass(&TPass2); + // Check runOnFunction(). + FPM.runOnFunction(*F); + EXPECT_EQ(BBCnt1, 1u); + EXPECT_EQ(BBCnt2, 1u); +#ifndef NDEBUG + // Check dump(). + std::string Buff; + llvm::raw_string_ostream SS(Buff); + FPM.print(SS); + EXPECT_EQ(Buff, "test-fpm(test-pass1,test-pass2)"); +#endif // NDEBUG +} From 99fb1506a869fa5e82dbd36e1a63cd21450f1502 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 10 Sep 2024 20:18:56 +0000 Subject: [PATCH 008/114] [gn build] Port 3363760f9a00 --- llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn index aa49c76e84cec..e69104909330d 100644 --- a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn @@ -6,6 +6,7 @@ static_library("SandboxIR") { ] sources = [ "Pass.cpp", + "PassManager.cpp", "SandboxIR.cpp", "Tracker.cpp", "Type.cpp", From e3c537ff903af9a92ff43bab6d21c0ea759d65e5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 10 Sep 2024 21:37:12 +0100 Subject: [PATCH 009/114] [VPlan] Consider non-header phis in planContainsAdditionalSimp. Update planContainsAdditionalSimplifications to also check phis not in the loop header. This ensures we don't miss cases where VPBlendRecipes (which correspond to such phis) have been simplified. Fixes https://github.com/llvm/llvm-project/issues/107473. 
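
Roughly, the new IR test (test_phi_in_latch_redundant, added below)
corresponds to this hypothetical C++ reduction, in which %p becomes a
phi in the loop latch rather than the loop header, and its blend folds
away because the branch condition is a known constant:

  void test(int *dst, int a) {
    for (long i = 0; i < 325; i += 9) {
      int p = false ? ~a : 0; // blend of {~a, 0} simplifies to 0
      dst[i] = p;
    }
  }

Previously such simplified non-header phis went undetected, so a benign
divergence between the legacy and VPlan-based cost models could trip
the cost-model consistency check.
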
--- .../Transforms/Vectorize/LoopVectorize.cpp | 7 +- .../LoopVectorize/RISCV/dead-ops-cost.ll | 82 +++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2be3b57752925..b821da03c16e9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7314,9 +7314,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, // Return true if the loop contains any instructions that are not also part of // the VPlan or are skipped for VPlan-based cost computations. This indicates // that the VPlan contains extra simplifications. - return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx](BasicBlock *BB) { - return any_of(*BB, [&SeenInstrs, &CostCtx](Instruction &I) { - if (isa(&I)) + return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx, + TheLoop](BasicBlock *BB) { + return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) { + if (isa(&I) && BB == TheLoop->getHeader()) return false; return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true); }); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 54c7299f6db0f..6d309c4453c7e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -322,7 +322,87 @@ return: ret i32 0 } +; Test case for https://github.com/llvm/llvm-project/issues/107473. +define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) { +; CHECK-LABEL: define void @test_phi_in_latch_redundant( +; CHECK-SAME: ptr [[DST:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 37, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 37, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 37, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 9 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], shufflevector ( insertelement ( poison, i64 9, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 9, [[TMP5]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[BROADCAST_SPLAT]], 
shufflevector ( insertelement ( poison, i32 -1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[DST]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP10]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 37, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: br i1 false, label %[[LOOP_LATCH]], label %[[THEN:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[NOT_A:%.*]] = xor i32 [[A]], -1 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[NOT_A]], %[[THEN]] ], [ 0, %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 [[P]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 9 +; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], 322 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 false, label %loop.latch, label %then + +then: + %not.a = xor i32 %a, -1 + br label %loop.latch + +loop.latch: + %p = phi i32 [ %not.a, %then ], [ 0, %loop.header ] + %gep = getelementptr i32, ptr %dst, i64 %iv + store i32 %p, ptr %gep, align 4 + %iv.next = add i64 %iv, 9 + %ec = icmp slt i64 %iv, 322 + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} @@ -343,4 +423,6 @@ return: ; CHECK: [[META15]] = distinct !{[[META15]], [[META13]]} ; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} ; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]} +; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]} ;. From 0f56ba13bff7ab72bfafcf7c5cf9e5b8bd16d895 Mon Sep 17 00:00:00 2001 From: "Henrik G. Olsson" Date: Tue, 10 Sep 2024 13:38:04 -0700 Subject: [PATCH 010/114] [llvm-lit] Process ANSI color codes in test output when formatting (#106776) Test output that carried color across newlines previously resulted in the formatting around the output also being colored. Detect the current ANSI color and reset it when printing formatting, and then reapply it. As an added bonus an unterminated color code is also detected, preventing it from leaking out into the rest of the terminal. 
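
A minimal sketch of the intended behavior (assuming a lit checkout on
PYTHONPATH; the real coverage is the escape-color test added below):

  from lit.TestRunner import formatOutput

  # "still red" inherits red from the line above; the "# | " framing
  # stays uncolored, and the dangling green is reset at the very end.
  data = "\33[31mred\nstill red\33[0m\nplain\n\33[32mnever terminated\n"
  print(formatOutput("command stdout", data))
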
Fixes #106633 --- llvm/utils/lit/lit/TestRunner.py | 28 +++++++++++++++++-- .../Inputs/escape-color/color-escaped.txt | 10 +++++++ .../lit/tests/Inputs/escape-color/color.txt | 6 ++++ .../lit/tests/Inputs/escape-color/lit.cfg | 8 ++++++ llvm/utils/lit/tests/escape-color.py | 4 +++ 5 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt create mode 100644 llvm/utils/lit/tests/Inputs/escape-color/color.txt create mode 100644 llvm/utils/lit/tests/Inputs/escape-color/lit.cfg create mode 100644 llvm/utils/lit/tests/escape-color.py diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 19f35fc7e212f..a2c76d41a43e0 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1017,6 +1017,20 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): return exitCode +def findColor(line, curr_color): + start = line.rfind("\33[") + if start == -1: + return curr_color + end = line.find("m", start+2) + if end == -1: + return curr_color + match = line[start:end+1] + # "\33[0m" means "reset all formatting". Sometimes the 0 is skipped. + if match == "\33[m" or match == "\33[0m": + return None + return match + + def formatOutput(title, data, limit=None): if not data.strip(): return "" @@ -1027,8 +1041,18 @@ def formatOutput(title, data, limit=None): msg = "" ndashes = 30 # fmt: off - out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n" - out += f"# | " + "\n# | ".join(data.splitlines()) + "\n" + out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n" + curr_color = None + for line in data.splitlines(): + if curr_color: + out += "\33[0m" + out += "# | " + if curr_color: + out += curr_color + out += line + "\n" + curr_color = findColor(line, curr_color) + if curr_color: + out += "\33[0m" # prevent unterminated formatting from leaking out += f"# `---{msg}{'-' * (ndashes - 4 - len(msg))}\n" # fmt: on return out diff --git a/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt b/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt new file mode 100644 index 0000000000000..e7a33e380b351 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt @@ -0,0 +1,10 @@ +# .---command stdout------------ +# | # RUN: cat %s +# | red +# | still red(B +# | plain +# | green +# | still green (never terminated) +# `----------------------------- + +-- diff --git a/llvm/utils/lit/tests/Inputs/escape-color/color.txt b/llvm/utils/lit/tests/Inputs/escape-color/color.txt new file mode 100644 index 0000000000000..15ffc22d134f0 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/escape-color/color.txt @@ -0,0 +1,6 @@ +# RUN: cat %s +red +still red(B +plain +green +still green (never terminated) diff --git a/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg b/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg new file mode 100644 index 0000000000000..36f4eb69d4858 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg @@ -0,0 +1,8 @@ +import lit.formats + +config.name = "escape-color" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None + diff --git a/llvm/utils/lit/tests/escape-color.py b/llvm/utils/lit/tests/escape-color.py new file mode 100644 index 0000000000000..1d0b93b004e9d --- /dev/null +++ b/llvm/utils/lit/tests/escape-color.py @@ -0,0 +1,4 @@ +# cut off the first 9 lines to avoid absolute file paths in the output +# then keep only the next 10 lines to avoid test timing in 
the output +# RUN: %{lit} %{inputs}/escape-color/color.txt -a | tail -n +10 | head -n 10 > %t +# RUN: diff %{inputs}/escape-color/color-escaped.txt %t From b5ce7a9fbd898767d0c1fa13ef5c33bbbe67981f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 10 Sep 2024 14:07:26 -0700 Subject: [PATCH 011/114] [flang][cuda] Avoid generating data transfer when calling size intrinsic (#108081) cuf.data_transfer was wrongly generated when calling the `size` intrinsic on a device allocatable variable. Since the descriptor is available on the host, there is no transfer needed. Add `DescriptorInquiry` in the `CollectCudaSymbolsHelper` to filter out symbols that are not needed for the transfer decision to be made. --- flang/lib/Evaluate/tools.cpp | 3 +++ flang/test/Lower/CUDA/cuda-data-transfer.cuf | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 6b3db619c1e2f..400f27aef98da 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1011,6 +1011,9 @@ struct CollectCudaSymbolsHelper : public SetTraverse>> ! CHECK-NOT: cuf.data_transfer + +subroutine sub18() + integer, device, allocatable :: a(:) + integer :: isz + + isz = size(a) +end subroutine + +! CHECK-LABEL: func.func @_QPsub18() +! CHECK-NOT: cuf.data_transfer From d8a8eae6daa6523b28526e2b8f65879e74858f68 Mon Sep 17 00:00:00 2001 From: Daniel Thornburgh Date: Tue, 10 Sep 2024 14:08:27 -0700 Subject: [PATCH 012/114] Revert "[libc++][string] Remove potential non-trailing 0-length array" (#108091) Reverts llvm/llvm-project#105865 This breaks a pair of LLDB tests in CI. --- libcxx/include/string | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index aba79a74912f5..3480b57375c11 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -749,14 +749,6 @@ struct __can_be_converted_to_string_view struct __uninitialized_size_tag {}; struct __init_with_sentinel_tag {}; -template -struct __padding { - char __padding_[_PaddingSize]; -}; - -template <> -struct __padding<0> {}; - template class basic_string { private: @@ -861,7 +853,7 @@ private: struct __short { value_type __data_[__min_cap]; - _LIBCPP_NO_UNIQUE_ADDRESS __padding __padding_; + unsigned char __padding_[sizeof(value_type) - 1]; unsigned char __size_ : 7; unsigned char __is_long_ : 1; }; @@ -913,7 +905,7 @@ private: unsigned char __is_long_ : 1; unsigned char __size_ : 7; }; - _LIBCPP_NO_UNIQUE_ADDRESS __padding __padding_; + char __padding_[sizeof(value_type) - 1]; value_type __data_[__min_cap]; }; From ce392471c0d9cb3ef88d05fcbcff59de8ea0c1e1 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 14:08:55 -0700 Subject: [PATCH 013/114] [flang] Silence spurious error on non-CUDA use of CUDA module (#107444) When a module file has been compiled with CUDA enabled, don't emit spurious errors about non-interoperable types when that module is read by a USE statement in a later non-CUDA compilation. 
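
The new test reduces to this pair of compilations, where the module is
built as CUDA Fortran (so REAL(2) is interoperable there) and is then
USE-associated from a non-CUDA source file:

  ! modfile66.cuf, compiled with CUDA enabled
  module usereal2
    real(2), bind(c) :: x
  end

  ! modfile66.f90, compiled without CUDA; must not diagnose x,
  ! which is not even used here
  use usereal2
  end
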
--- flang/include/flang/Semantics/type.h | 3 --- flang/lib/Evaluate/type.cpp | 4 ++-- flang/lib/Semantics/check-declarations.cpp | 21 +++++++++++++-------- flang/lib/Semantics/expression.cpp | 4 ++-- flang/lib/Semantics/type.cpp | 9 --------- flang/test/Semantics/Inputs/modfile66.cuf | 4 ++++ flang/test/Semantics/modfile66.f90 | 3 +++ 7 files changed, 24 insertions(+), 24 deletions(-) create mode 100644 flang/test/Semantics/Inputs/modfile66.cuf create mode 100644 flang/test/Semantics/modfile66.f90 diff --git a/flang/include/flang/Semantics/type.h b/flang/include/flang/Semantics/type.h index 04f8b11e992a0..e2d47d38f927f 100644 --- a/flang/include/flang/Semantics/type.h +++ b/flang/include/flang/Semantics/type.h @@ -459,8 +459,5 @@ inline const DerivedTypeSpec *DeclTypeSpec::AsDerived() const { return const_cast(this)->AsDerived(); } -std::optional IsInteroperableIntrinsicType( - const DeclTypeSpec &, const common::LanguageFeatureControl &); - } // namespace Fortran::semantics #endif // FORTRAN_SEMANTICS_TYPE_H_ diff --git a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp index 5ecc3701b4f24..a1df40667471a 100644 --- a/flang/lib/Evaluate/type.cpp +++ b/flang/lib/Evaluate/type.cpp @@ -820,8 +820,8 @@ std::optional IsInteroperableIntrinsicType(const DynamicType &type, return true; case TypeCategory::Real: case TypeCategory::Complex: - return (features && features->IsEnabled(common::LanguageFeature::CUDA)) || - type.kind() >= 4; // no short or half floats + return type.kind() >= 4 /* not a short or half float */ || !features || + features->IsEnabled(common::LanguageFeature::CUDA); case TypeCategory::Logical: return type.kind() == 1; // C_BOOL case TypeCategory::Character: diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 734c34276b13b..c896ee7d29381 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -3003,17 +3003,17 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType( } else { msgs.Annex(std::move(bad)); } - } else if (!IsInteroperableIntrinsicType( - *type, context_.languageFeatures()) + } else if (auto dyType{evaluate::DynamicType::From(*type)}; dyType && + !evaluate::IsInteroperableIntrinsicType( + *dyType, &context_.languageFeatures()) .value_or(false)) { - auto maybeDyType{evaluate::DynamicType::From(*type)}; if (type->category() == DeclTypeSpec::Logical) { if (context_.ShouldWarn(common::UsageWarning::LogicalVsCBool)) { msgs.Say(component.name(), "A LOGICAL component of an interoperable type should have the interoperable KIND=C_BOOL"_port_en_US); } - } else if (type->category() == DeclTypeSpec::Character && - maybeDyType && maybeDyType->kind() == 1) { + } else if (type->category() == DeclTypeSpec::Character && dyType && + dyType->kind() == 1) { if (context_.ShouldWarn(common::UsageWarning::BindCCharLength)) { msgs.Say(component.name(), "A CHARACTER component of an interoperable type should have length 1"_port_en_US); @@ -3106,10 +3106,15 @@ parser::Messages CheckHelper::WhyNotInteroperableObject(const Symbol &symbol) { type->category() == DeclTypeSpec::Character && type->characterTypeSpec().length().isDeferred()) { // ok; F'2023 18.3.7 p2(6) - } else if (derived || - IsInteroperableIntrinsicType(*type, context_.languageFeatures()) - .value_or(false)) { + } else if (derived) { // type has been checked + } else if (auto dyType{evaluate::DynamicType::From(*type)}; dyType && + evaluate::IsInteroperableIntrinsicType(*dyType, + InModuleFile() ? 
nullptr : &context_.languageFeatures()) + .value_or(false)) { // F'2023 18.3.7 p2(4,5) + // N.B. Language features are not passed to IsInteroperableIntrinsicType + // when processing a module file, since the module file might have been + // compiled with CUDA while the client is not. } else if (type->category() == DeclTypeSpec::Logical) { if (context_.ShouldWarn(common::UsageWarning::LogicalVsCBool) && !InModuleFile()) { diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 3684839c187e6..0eabe532cfe0c 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -1956,7 +1956,7 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::ArrayConstructor &array) { // Check if implicit conversion of expr to the symbol type is legal (if needed), // and make it explicit if requested. -static MaybeExpr implicitConvertTo(const semantics::Symbol &sym, +static MaybeExpr ImplicitConvertTo(const semantics::Symbol &sym, Expr &&expr, bool keepConvertImplicit) { if (!keepConvertImplicit) { return ConvertToType(sym, std::move(expr)); @@ -2196,7 +2196,7 @@ MaybeExpr ExpressionAnalyzer::Analyze( // convert would cause a segfault. Lowering will deal with // conditionally converting and preserving the lower bounds in this // case. - if (MaybeExpr converted{implicitConvertTo( + if (MaybeExpr converted{ImplicitConvertTo( *symbol, std::move(*value), IsAllocatable(*symbol))}) { if (auto componentShape{GetShape(GetFoldingContext(), *symbol)}) { if (auto valueShape{GetShape(GetFoldingContext(), *converted)}) { diff --git a/flang/lib/Semantics/type.cpp b/flang/lib/Semantics/type.cpp index cfaee0b8ba6dc..aa6e8973ebd30 100644 --- a/flang/lib/Semantics/type.cpp +++ b/flang/lib/Semantics/type.cpp @@ -893,13 +893,4 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &o, const DeclTypeSpec &x) { return o << x.AsFortran(); } -std::optional IsInteroperableIntrinsicType( - const DeclTypeSpec &type, const common::LanguageFeatureControl &features) { - if (auto dyType{evaluate::DynamicType::From(type)}) { - return IsInteroperableIntrinsicType(*dyType, &features); - } else { - return std::nullopt; - } -} - } // namespace Fortran::semantics diff --git a/flang/test/Semantics/Inputs/modfile66.cuf b/flang/test/Semantics/Inputs/modfile66.cuf new file mode 100644 index 0000000000000..be400da749148 --- /dev/null +++ b/flang/test/Semantics/Inputs/modfile66.cuf @@ -0,0 +1,4 @@ +module usereal2 + !REAL(2) is interoperable under CUDA + real(2), bind(c) :: x +end diff --git a/flang/test/Semantics/modfile66.f90 b/flang/test/Semantics/modfile66.f90 new file mode 100644 index 0000000000000..51b4d8375d50d --- /dev/null +++ b/flang/test/Semantics/modfile66.f90 @@ -0,0 +1,3 @@ +! RUN: %flang_fc1 -fsyntax-only %S/Inputs/modfile66.cuf && %flang_fc1 -fsyntax-only %s +use usereal2 ! valid since x is not used +end From 5a229dbca19f5ad7ebd15c85b432cd54e70fd09a Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 14:10:40 -0700 Subject: [PATCH 014/114] [flang] Relax error into a warning (#107489) The standard requires that a generic interface with the same name as a derived type contain only functions. We generally allow a generic interface to contain both functions and subroutines, since there's never any ambiguity at the point of call; these is helpful when the specific procedures of two generics are combined during USE association. 
Emit a warning instead of a hard error when a generic interface with the same name as a derived type contains a subroutine to improve portability of code from compilers that don't check for this condition. --- flang/docs/Extensions.md | 5 +-- flang/lib/Semantics/resolve-names.cpp | 44 +++++++++++++-------------- flang/test/Semantics/resolve24.f90 | 8 ++--- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index fb57744c21570..340939baca99f 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -507,10 +507,7 @@ end f18 supports them with a portability warning. * f18 does not enforce a blanket prohibition against generic interfaces containing a mixture of functions and subroutines. - Apart from some contexts in which the standard requires all of - a particular generic interface to have only all functions or - all subroutines as its specific procedures, we allow both to - appear, unlike several other Fortran compilers. + We allow both to appear, unlike several other Fortran compilers. This is especially desirable when two generics of the same name are combined due to USE association and the mixture may be inadvertent. diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 2e86e0afc9bd0..b764678357db3 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -3639,36 +3639,36 @@ void InterfaceVisitor::CheckGenericProcedures(Symbol &generic) { } return; } - const Symbol &firstSpecific{specifics.front()}; - bool isFunction{firstSpecific.test(Symbol::Flag::Function)}; - bool isBoth{false}; + const Symbol *function{nullptr}; + const Symbol *subroutine{nullptr}; for (const Symbol &specific : specifics) { - if (isFunction != specific.test(Symbol::Flag::Function)) { // C1514 - if (context().ShouldWarn( + if (!function && specific.test(Symbol::Flag::Function)) { + function = &specific; + } else if (!subroutine && specific.test(Symbol::Flag::Subroutine)) { + subroutine = &specific; + if (details.derivedType() && + context().ShouldWarn( common::LanguageFeature::SubroutineAndFunctionSpecifics)) { + SayDerivedType(generic.name(), + "Generic interface '%s' should only contain functions due to derived type with same name"_warn_en_US, + *details.derivedType()->GetUltimate().scope()); + } + } + if (function && subroutine) { + if (context().ShouldWarn(common::LanguageFeature:: + SubroutineAndFunctionSpecifics)) { // C1514 auto &msg{Say(generic.name(), "Generic interface '%s' has both a function and a subroutine"_warn_en_US)}; - if (isFunction) { - msg.Attach(firstSpecific.name(), "Function declaration"_en_US); - msg.Attach(specific.name(), "Subroutine declaration"_en_US); - } else { - msg.Attach(firstSpecific.name(), "Subroutine declaration"_en_US); - msg.Attach(specific.name(), "Function declaration"_en_US); - } + msg.Attach(function->name(), "Function declaration"_en_US); + msg.Attach(subroutine->name(), "Subroutine declaration"_en_US); } - isFunction = false; - isBoth = true; break; } } - if (!isFunction && details.derivedType()) { - SayDerivedType(generic.name(), - "Generic interface '%s' may only contain functions due to derived type" - " with same name"_err_en_US, - *details.derivedType()->GetUltimate().scope()); - } - if (!isBoth) { - generic.set(isFunction ? 
Symbol::Flag::Function : Symbol::Flag::Subroutine);
+  if (function && !subroutine) {
+    generic.set(Symbol::Flag::Function);
+  } else if (subroutine && !function) {
+    generic.set(Symbol::Flag::Subroutine);
   }
 }
 
diff --git a/flang/test/Semantics/resolve24.f90 b/flang/test/Semantics/resolve24.f90
index 4af6f202cf4f1..72d6719665bb5 100644
--- a/flang/test/Semantics/resolve24.f90
+++ b/flang/test/Semantics/resolve24.f90
@@ -1,6 +1,6 @@
 ! RUN: %python %S/test_errors.py %s %flang_fc1
 subroutine test1
-  !ERROR: Generic interface 'foo' has both a function and a subroutine
+  !WARNING: Generic interface 'foo' has both a function and a subroutine
   interface foo
     subroutine s1(x)
     end subroutine
@@ -12,7 +12,7 @@ function f()
 end subroutine
 
 subroutine test2
-  !ERROR: Generic interface 'foo' has both a function and a subroutine
+  !WARNING: Generic interface 'foo' has both a function and a subroutine
   interface foo
     function t2f1(x)
     end function
@@ -24,7 +24,7 @@ function t2f2(x, y)
 end subroutine
 
 module test3
-  !ERROR: Generic interface 'foo' has both a function and a subroutine
+  !WARNING: Generic interface 'foo' has both a function and a subroutine
   interface foo
     module procedure s
     module procedure f
@@ -39,7 +39,7 @@ function f()
 subroutine test4
   type foo
   end type
-  !ERROR: Generic interface 'foo' may only contain functions due to derived type with same name
+  !WARNING: Generic interface 'foo' should only contain functions due to derived type with same name
   interface foo
     subroutine s()
     end subroutine

From d2126ec1af543b23817c742c283ec441f21bf42b Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 10 Sep 2024 14:13:09 -0700
Subject: [PATCH 015/114] [flang] Fix bogus error about procedure
 incompatibility (#107645)

This was a subtle problem. When the shape of a function result is
explicit but not constant, it is characterized with bounds expressions
that use Extremum operations to force extents to 0 rather than be
negative. These Extremum operations are formatted as "max()" intrinsic
functions in the module file. Upon being read from the module file,
they are not folded back into Extremum operations, but remain as
function references; and this then leads to expressions not comparing
equal when the procedure characteristics are compared to those of a
local procedure declared identically.

The real fix here would be for folding to just always change max and
min function references into Extremum operations, constant operands or
not, and I tried that, but it led to test failures and crashes in
lowering that I couldn't resolve. So, until those can be fixed, here's
a change that will read max/min operations in module file declarations
back into Extremum operations to solve the compatibility checking
problem, but leave other non-constant max/min operations as function
calls.
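
Concretely, the new test's module file (checked in below) declares a
function result whose bounds were normalized with Extremum operations
and are formatted as max() references:

  pure function foo(n,a) result(r)
    integer(4),intent(in)::n
    real(4),intent(in)::a(1_8:int(n,kind=8))
    logical(4)::r(1_8:int(int(max(0_8,int(n,kind=8)),kind=4),kind=8))
  end

Unless that max(...) is folded back into an Extremum when the module
file is read, a client procedure declared with the equivalent
"logical, dimension(size(a)) :: r" no longer compares equal to it
during interface compatibility checking.
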
--- flang/include/flang/Evaluate/expression.h | 3 ++ flang/include/flang/Evaluate/tools.h | 16 ++++++++ flang/lib/Evaluate/expression.cpp | 22 ++++++++-- flang/lib/Evaluate/fold-implementation.h | 50 +++++++++++++++-------- flang/test/Semantics/Inputs/modfile67.mod | 16 ++++++++ flang/test/Semantics/modfile67.f90 | 35 ++++++++++++++++ 6 files changed, 122 insertions(+), 20 deletions(-) create mode 100644 flang/test/Semantics/Inputs/modfile67.mod create mode 100644 flang/test/Semantics/modfile67.f90 diff --git a/flang/include/flang/Evaluate/expression.h b/flang/include/flang/Evaluate/expression.h index 3ba46edba717b..2a40193e32306 100644 --- a/flang/include/flang/Evaluate/expression.h +++ b/flang/include/flang/Evaluate/expression.h @@ -342,6 +342,7 @@ template struct Extremum : public Operation, A, A, A> { : Base{x, y}, ordering{ord} {} Extremum(Ordering ord, Expr &&x, Expr &&y) : Base{std::move(x), std::move(y)}, ordering{ord} {} + bool operator==(const Extremum &) const; Ordering ordering{Ordering::Greater}; }; @@ -381,6 +382,7 @@ struct LogicalOperation : Base{x, y}, logicalOperator{opr} {} LogicalOperation(LogicalOperator opr, Expr &&x, Expr &&y) : Base{std::move(x), std::move(y)}, logicalOperator{opr} {} + bool operator==(const LogicalOperation &) const; LogicalOperator logicalOperator; }; @@ -634,6 +636,7 @@ class Relational : public Operation, LogicalResult, T, T> { : Base{a, b}, opr{r} {} Relational(RelationalOperator r, Expr &&a, Expr &&b) : Base{std::move(a), std::move(b)}, opr{r} {} + bool operator==(const Relational &) const; RelationalOperator opr; }; diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 3675d9f924876..a0487e399d936 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -218,6 +218,22 @@ template A *UnwrapExpr(std::optional &x) { } } +template const A *UnwrapExpr(const B *x) { + if (x) { + return UnwrapExpr(*x); + } else { + return nullptr; + } +} + +template A *UnwrapExpr(B *x) { + if (x) { + return UnwrapExpr(*x); + } else { + return nullptr; + } +} + // A variant of UnwrapExpr above that also skips through (parentheses) // and conversions of kinds within a category. Useful for extracting LEN // type parameter inquiries, at least. 
diff --git a/flang/lib/Evaluate/expression.cpp b/flang/lib/Evaluate/expression.cpp index 5b0bc14dc3e1b..1a65d4c7362fe 100644 --- a/flang/lib/Evaluate/expression.cpp +++ b/flang/lib/Evaluate/expression.cpp @@ -125,6 +125,24 @@ template LLVM_DUMP_METHOD void ExpressionBase::dump() const { // Equality testing +template bool Extremum::operator==(const Extremum &that) const { + return ordering == that.ordering && Base::operator==(that); +} + +template +bool LogicalOperation::operator==(const LogicalOperation &that) const { + return logicalOperator == that.logicalOperator && Base::operator==(that); +} + +template +bool Relational::operator==(const Relational &that) const { + return opr == that.opr && Base::operator==(that); +} + +bool Relational::operator==(const Relational &that) const { + return u == that.u; +} + bool ImpliedDoIndex::operator==(const ImpliedDoIndex &that) const { return name == that.name; } @@ -181,10 +199,6 @@ bool StructureConstructor::operator==(const StructureConstructor &that) const { return result_ == that.result_ && values_ == that.values_; } -bool Relational::operator==(const Relational &that) const { - return u == that.u; -} - template bool Expr>::operator==( const Expr> &that) const { diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h index 9ce0edbdcb779..1b14a305b87f4 100644 --- a/flang/lib/Evaluate/fold-implementation.h +++ b/flang/lib/Evaluate/fold-implementation.h @@ -1088,24 +1088,42 @@ Expr FoldMINorMAX( static_assert(T::category == TypeCategory::Integer || T::category == TypeCategory::Real || T::category == TypeCategory::Character); - std::vector *> constantArgs; - // Call Folding on all arguments, even if some are not constant, - // to make operand promotion explicit. - for (auto &arg : funcRef.arguments()) { - if (auto *cst{Folder{context}.Folding(arg)}) { - constantArgs.push_back(cst); + auto &args{funcRef.arguments()}; + bool ok{true}; + std::optional> result; + Folder folder{context}; + for (std::optional &arg : args) { + // Call Folding on all arguments to make operand promotion explicit. + if (!folder.Folding(arg)) { + // TODO: Lowering can't handle having every FunctionRef for max and min + // being converted into Extremum. That needs fixing. Until that + // is corrected, however, it is important that max and min references + // in module files be converted into Extremum even when not constant; + // the Extremum operations created to normalize the + // values of array bounds are formatted as max operations in the + // declarations in modules, and need to be read back in as such in + // order for expression comparison to not produce false inequalities + // when checking function results for procedure interface compatibility. + if (!context.moduleFileName()) { + ok = false; + } + } + Expr *argExpr{arg ? 
arg->UnwrapExpr() : nullptr}; + if (argExpr) { + *argExpr = Fold(context, std::move(*argExpr)); + } + if (Expr * tExpr{UnwrapExpr>(argExpr)}) { + if (result) { + result = FoldOperation( + context, Extremum{order, std::move(*result), Expr{*tExpr}}); + } else { + result = Expr{*tExpr}; + } + } else { + ok = false; } } - if (constantArgs.size() != funcRef.arguments().size()) { - return Expr(std::move(funcRef)); - } - CHECK(!constantArgs.empty()); - Expr result{std::move(*constantArgs[0])}; - for (std::size_t i{1}; i < constantArgs.size(); ++i) { - Extremum extremum{order, result, Expr{std::move(*constantArgs[i])}}; - result = FoldOperation(context, std::move(extremum)); - } - return result; + return ok && result ? std::move(*result) : Expr{std::move(funcRef)}; } // For AMAX0, AMIN0, AMAX1, AMIN1, DMAX1, DMIN1, MAX0, MIN0, MAX1, and MIN1 diff --git a/flang/test/Semantics/Inputs/modfile67.mod b/flang/test/Semantics/Inputs/modfile67.mod new file mode 100644 index 0000000000000..1aa0158e35089 --- /dev/null +++ b/flang/test/Semantics/Inputs/modfile67.mod @@ -0,0 +1,16 @@ +!mod$ v1 sum:37cfecee3234c8ab +module modfile67 +type::t +procedure(foo),nopass,pointer::p +end type +contains +pure function foo(n,a) result(r) +integer(4),intent(in)::n +real(4),intent(in)::a(1_8:int(n,kind=8)) +logical(4)::r(1_8:int(int(max(0_8,int(n,kind=8)),kind=4),kind=8)) +end +function fooptr(f) +procedure(foo)::f +type(t)::fooptr +end +end diff --git a/flang/test/Semantics/modfile67.f90 b/flang/test/Semantics/modfile67.f90 new file mode 100644 index 0000000000000..18cf95bd42fbf --- /dev/null +++ b/flang/test/Semantics/modfile67.f90 @@ -0,0 +1,35 @@ +!RUN: %flang_fc1 -fsyntax-only -J%S/Inputs %s + +#if 0 +!modfile67.mod was produced from this source, and must be read into this +!compilation from its module file in order to truly test this fix. +module modfile67 + type t + procedure(foo), nopass, pointer :: p + end type + contains + pure function foo(n,a) result(r) + integer, intent(in) :: n + real, intent(in), dimension(n) :: a + logical, dimension(size(a)) :: r + r = .false. + end + type(t) function fooptr(f) + procedure(foo) f + fooptr%p => f + end +end +#endif + +program test + use modfile67 + type(t) x + x = fooptr(bar) ! ensure no bogus error about procedure incompatibility + contains + pure function bar(n,a) result(r) + integer, intent(in) :: n + real, intent(in), dimension(n) :: a + logical, dimension(size(a)) :: r + r = .false. + end +end From fe58527305d86df8bd9770f3d41a6de420958af7 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 14:11:37 -0700 Subject: [PATCH 016/114] [flang] Relax ETIME(VALUES=) runtime checking (#107647) Don't require the "VALUES=" argument to the extension intrinsic procedure ETIME to have exactly two elements. Other compilers that support ETIME do not, and it's easy to adapt the behavior to whatever the dynamic size turns out to be. --- flang/runtime/time-intrinsic.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/flang/runtime/time-intrinsic.cpp b/flang/runtime/time-intrinsic.cpp index 7e590eabf3966..e6f6e81c7b50c 100644 --- a/flang/runtime/time-intrinsic.cpp +++ b/flang/runtime/time-intrinsic.cpp @@ -490,16 +490,20 @@ void RTNAME(Etime)(const Descriptor *values, const Descriptor *time, auto typeCode{values->type().GetCategoryAndKind()}; // ETIME values argument must have decimal range == 2. 
  RUNTIME_CHECK(terminator,
-      values->rank() == 1 && values->GetDimension(0).Extent() == 2 &&
-          typeCode && typeCode->first == Fortran::common::TypeCategory::Real);
+      values->rank() == 1 && typeCode &&
+          typeCode->first == Fortran::common::TypeCategory::Real);
   // Only accept KIND=4 here.
   int kind{typeCode->second};
   RUNTIME_CHECK(terminator, kind == 4);
-
-  ApplyFloatingPointKind(
-      kind, terminator, *values, /* atIndex = */ 0, usrTime);
-  ApplyFloatingPointKind(
-      kind, terminator, *values, /* atIndex = */ 1, sysTime);
+  auto extent{values->GetDimension(0).Extent()};
+  if (extent >= 1) {
+    ApplyFloatingPointKind(
+        kind, terminator, *values, /* atIndex = */ 0, usrTime);
+  }
+  if (extent >= 2) {
+    ApplyFloatingPointKind(
+        kind, terminator, *values, /* atIndex = */ 1, sysTime);
+  }
  }

  if (time) {

From 26ac30bcec71ae97ba740fb6cf473eac3ac37887 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 10 Sep 2024 14:13:09 -0700
Subject: [PATCH 017/114] [flang] Accept initialized SAVE local in
 specification expression (#107656)

Specification expressions may contain references to dummy arguments,
host objects, module variables, and variables in COMMON blocks, since
they will have values on entry to the scope. A local variable with an
initializer and the SAVE attribute (which will always be implied by an
explicit initialization) will also always work, and is accepted by at
least one other compiler, so accept it with a warning.

---
 flang/docs/Extensions.md                      |  3 +++
 flang/include/flang/Common/Fortran-features.h |  5 ++++-
 flang/lib/Evaluate/check-expression.cpp       | 17 +++++++++++++++--
 flang/test/Semantics/resolve69.f90            |  6 +++---
 flang/test/Semantics/resolve77.f90            |  1 +
 flang/test/Semantics/spec-expr.f90            |  2 +-
 6 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 340939baca99f..ed1ef49f8b77a 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -386,6 +386,9 @@ end
   probably by a C or C++ external definition.
 * An automatic data object may be declared in the specification part
   of the main program.
+* A local data object may appear in a specification expression, even
+  when it is not a dummy argument or in COMMON, so long as it has
+  the SAVE attribute and was initialized.
### Extensions supported when enabled by options diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index 0c8a3d2bd5281..86c6e02b0f2ff 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -51,7 +51,8 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize, NonBindCInteroperability, CudaManaged, CudaUnified, PolymorphicActualAllocatableOrPointerToMonomorphicDummy, RelaxedPureDummy, - UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr) + UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr, + SavedLocalInSpecExpr) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, @@ -146,6 +147,8 @@ class LanguageFeatureControl { warnUsage_.set(UsageWarning::VectorSubscriptFinalization); warnUsage_.set(UsageWarning::UndefinedFunctionResult); warnUsage_.set(UsageWarning::UselessIomsg); + // New warnings, on by default + warnLanguage_.set(LanguageFeature::SavedLocalInSpecExpr); } LanguageFeatureControl(const LanguageFeatureControl &) = default; diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index fef4620857a08..8a90404db0456 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -554,6 +554,18 @@ class CheckSpecificationExprHelper } } else if (&symbol.owner() != &scope_ || &ultimate.owner() != &scope_) { return std::nullopt; // host association is in play + } else if (semantics::IsSaved(ultimate) && + semantics::IsInitialized(ultimate) && + context_.languageFeatures().IsEnabled( + common::LanguageFeature::SavedLocalInSpecExpr)) { + if (!scope_.IsModuleFile() && + context_.languageFeatures().ShouldWarn( + common::LanguageFeature::SavedLocalInSpecExpr)) { + context_.messages().Say( + "specification expression refers to local object '%s' (initialized and saved)"_port_en_US, + ultimate.name().ToString()); + } + return std::nullopt; } else if (const auto *object{ ultimate.detailsIf()}) { if (object->commonBlock()) { @@ -781,8 +793,9 @@ bool CheckSpecificationExprHelper::IsPermissibleInquiry( template void CheckSpecificationExpr(const A &x, const semantics::Scope &scope, FoldingContext &context, bool forElementalFunctionResult) { - if (auto why{CheckSpecificationExprHelper{ - scope, context, forElementalFunctionResult}(x)}) { + CheckSpecificationExprHelper helper{ + scope, context, forElementalFunctionResult}; + if (auto why{helper(x)}) { context.messages().Say("Invalid specification expression%s: %s"_err_en_US, forElementalFunctionResult ? " for elemental function result" : "", *why); diff --git a/flang/test/Semantics/resolve69.f90 b/flang/test/Semantics/resolve69.f90 index e1f7773eee9da..5acfd30604fe3 100644 --- a/flang/test/Semantics/resolve69.f90 +++ b/flang/test/Semantics/resolve69.f90 @@ -16,7 +16,7 @@ subroutine s1() ! 
integer, parameter :: constVal = 1 integer :: nonConstVal = 1 -!ERROR: Invalid specification expression: reference to local entity 'nonconstval' +!PORTABILITY: specification expression refers to local object 'nonconstval' (initialized and saved) character(nonConstVal) :: colonString1 character(len=20, kind=constVal + 1) :: constKindString character(len=:, kind=constVal + 1), pointer :: constKindString1 @@ -53,13 +53,13 @@ function foo3() type (derived(constVal, 3)) :: constDerivedKind !ERROR: Value of KIND type parameter 'typekind' must be constant -!ERROR: Invalid specification expression: reference to local entity 'nonconstval' +!PORTABILITY: specification expression refers to local object 'nonconstval' (initialized and saved) type (derived(nonConstVal, 3)) :: nonConstDerivedKind !OK because all type-params are constants type (derived(3, constVal)) :: constDerivedLen -!ERROR: Invalid specification expression: reference to local entity 'nonconstval' +!PORTABILITY: specification expression refers to local object 'nonconstval' (initialized and saved) type (derived(3, nonConstVal)) :: nonConstDerivedLen !ERROR: 'colonderivedlen' has a type derived(typekind=3_4,typelen=:) with a deferred type parameter but is neither an allocatable nor an object pointer type (derived(3, :)) :: colonDerivedLen diff --git a/flang/test/Semantics/resolve77.f90 b/flang/test/Semantics/resolve77.f90 index 943993ee74d76..0133fac3bfbc5 100644 --- a/flang/test/Semantics/resolve77.f90 +++ b/flang/test/Semantics/resolve77.f90 @@ -60,6 +60,7 @@ pure integer function if2(n) block data common /blk2/ n data n/100/ + !PORTABILITY: specification expression refers to local object 'n' (initialized and saved) !ERROR: Automatic data object 'a' may not appear in a BLOCK DATA subprogram real a(n) end diff --git a/flang/test/Semantics/spec-expr.f90 b/flang/test/Semantics/spec-expr.f90 index aa010ed0bf7ed..9d209c3583b43 100644 --- a/flang/test/Semantics/spec-expr.f90 +++ b/flang/test/Semantics/spec-expr.f90 @@ -104,7 +104,7 @@ subroutine s7biii(x, y) integer :: local = 5 ! OK, since "localConst" is a constant real, dimension(localConst) :: realArray1 - !ERROR: Invalid specification expression: reference to local entity 'local' + !PORTABILITY: specification expression refers to local object 'local' (initialized and saved) real, dimension(local) :: realArray2 real, dimension(size(realArray1)) :: realArray3 ! ok real, dimension(size(x)) :: realArray4 ! ok From cd92c4255582299b9a55fa0dc485982b8f54c49a Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 14:13:28 -0700 Subject: [PATCH 018/114] [flang][runtime] Don't emit runtime error for "AA" editing (#107714) Commas are optional between edit descriptors in a format, so treat "AA" as if it were "A,A". 
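As a quick illustration (this small program is invented here, not part
of the patch), an internal read like the following used to fail with a
runtime format error and is now accepted, with "(AA)" read as "(A,A)":

    program aa_editing
      character(4) :: buf = 'abcd'
      character(2) :: c1, c2
      ! Two A edit descriptors with no separating comma.
      read(buf, '(AA)') c1, c2
      print '(A,1X,A)', c1, c2 ! prints: ab cd
    end program

Each A descriptor without a width takes its field width from the length
of the corresponding list item, so the two reads consume 'ab' and 'cd'.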
---
 flang/runtime/format-implementation.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h
index 74254bebe6e7a..46204ca927c13 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang/runtime/format-implementation.h
@@ -443,8 +443,9 @@ RT_API_ATTRS int FormatControl<CONTEXT>::CueUpNextDataEdit(
     if (ch != 'P') { // 1PE5.2 - comma not required (C1302)
       CharType peek{Capitalize(PeekNext())};
       if (peek >= 'A' && peek <= 'Z') {
-        if (ch == 'A' /* anticipate F'202X AT editing */ || ch == 'B' ||
-            ch == 'D' || ch == 'E' || ch == 'R' || ch == 'S' || ch == 'T') {
+        if ((ch == 'A' && peek == 'T' /* anticipate F'202X AT editing */) ||
+            ch == 'B' || ch == 'D' || ch == 'E' || ch == 'R' || ch == 'S' ||
+            ch == 'T') {
           // Assume a two-letter edit descriptor
           next = peek;
           ++offset_;

From ea858e39bf5b1d09021d142f0c82ef1d4a82d367 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 10 Sep 2024 14:13:47 -0700
Subject: [PATCH 019/114] [flang][runtime] Accept '\n' as space in internal
 list-directed input (#107716)

When scanning ahead for the first character in the next input item in
list-directed internal input, allow a newline character to appear and
treat it as a space, matching the behavior of nearly all other Fortran
compilers.

---
 flang/runtime/io-stmt.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h
index d67d1ec80afce..2e0ca46078ecd 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -194,8 +194,9 @@ class IoStatementState {
       std::size_t &byteCount) {
     auto ch{GetCurrentChar(byteCount)};
     bool inNamelist{mutableModes().inNamelist};
-    while (!ch || *ch == ' ' || *ch == '\t' || (inNamelist && *ch == '!')) {
-      if (ch && (*ch == ' ' || *ch == '\t')) {
+    while (!ch || *ch == ' ' || *ch == '\t' || *ch == '\n' ||
+        (inNamelist && *ch == '!')) {
+      if (ch && (*ch == ' ' || *ch == '\t' || *ch == '\n')) {
         HandleRelativePosition(byteCount);
       } else if (!AdvanceRecord()) {
         return Fortran::common::nullopt;

From 15106c26662a573df31e8dfdd9350c313b8bfd84 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 10 Sep 2024 14:14:08 -0700
Subject: [PATCH 020/114] [flang][runtime] Fix odd "invalid descriptor" runtime
 crash (#107785)

A defined assignment generic interface for a given LHS/RHS type & rank
combination may have a specific procedure with an LHS dummy argument
that is neither allocatable nor pointer, or specific procedure(s) whose
LHS dummy arguments are allocatable or pointer. It is possible to have
two specific procedures if one's LHS dummy argument is allocatable and
the other's is pointer. However, the runtime doesn't handle LHS dummy
arguments that are allocatable, and will crash with a mysterious
"invalid descriptor" error message.

Extend the list of special bindings to include
ScalarAllocatableAssignment and ScalarPointerAssignment, use them when
appropriate in the runtime type information tables, and handle them in
Assign() in the runtime support library.
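For reference, a minimal sketch of the affected pattern (the type and
procedure names are invented; the new typeinfo12.f90 test below is the
authoritative example): a type-bound defined assignment whose LHS dummy
argument is allocatable, which formerly crashed at run time.

    module m
      type t
        integer :: i
      contains
        ! The LHS is allocatable, so the passed-object dummy must be the RHS.
        procedure, pass(rhs) :: assign_alloc
        generic :: assignment(=) => assign_alloc
      end type
    contains
      subroutine assign_alloc(lhs, rhs)
        class(t), allocatable, intent(out) :: lhs
        class(t), intent(in) :: rhs
        allocate(lhs, source=rhs)
      end subroutine
    end module

    program p
      use m
      class(t), allocatable :: x
      x = t(1) ! formerly died with "invalid descriptor" at run time
    end program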
--- flang/lib/Semantics/expression.cpp | 3 +- flang/lib/Semantics/runtime-type-info.cpp | 21 +++++++-- flang/module/__fortran_type_info.f90 | 13 +++--- flang/runtime/assign.cpp | 16 +++++-- flang/runtime/descriptor-io.h | 2 +- flang/runtime/namelist.cpp | 4 +- flang/runtime/type-info.cpp | 9 +++- flang/runtime/type-info.h | 22 +++++----- flang/test/Semantics/typeinfo01.f90 | 18 ++++---- flang/test/Semantics/typeinfo02.f90 | 4 +- flang/test/Semantics/typeinfo04.f90 | 2 +- flang/test/Semantics/typeinfo12.f90 | 52 +++++++++++++++++++++++ 12 files changed, 126 insertions(+), 40 deletions(-) create mode 100644 flang/test/Semantics/typeinfo12.f90 diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 0eabe532cfe0c..a5b5d48e2bfee 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -4605,7 +4605,8 @@ std::optional ArgumentAnalyzer::GetDefinedAssignmentProc() { } for (std::size_t i{0}; !proc && i < actuals_.size(); ++i) { const Symbol *generic{nullptr}; - if (const Symbol *binding{FindBoundOp(oprName, i, generic, true)}) { + if (const Symbol * + binding{FindBoundOp(oprName, i, generic, /*isSubroutine=*/true)}) { if (CheckAccessibleSymbol(scope, DEREF(generic))) { // ignore inaccessible type-bound ASSIGNMENT(=) generic } else if (const Symbol * diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 9f3eb5fbe11a1..427a8421aeaf9 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -149,6 +149,10 @@ class RuntimeTableBuilder { SomeExpr explicitEnum_; // Value::Genre::Explicit SomeExpr lenParameterEnum_; // Value::Genre::LenParameter SomeExpr scalarAssignmentEnum_; // SpecialBinding::Which::ScalarAssignment + SomeExpr + scalarAllocatableAssignmentEnum_; // SpecialBinding::Which::ScalarAllocatableAssignment + SomeExpr + scalarPointerAssignmentEnum_; // SpecialBinding::Which::ScalarPointerAssignment SomeExpr elementalAssignmentEnum_; // SpecialBinding::Which::ElementalAssignment SomeExpr readFormattedEnum_; // SpecialBinding::Which::ReadFormatted @@ -174,6 +178,9 @@ RuntimeTableBuilder::RuntimeTableBuilder( explicitEnum_{GetEnumValue("explicit")}, lenParameterEnum_{GetEnumValue("lenparameter")}, scalarAssignmentEnum_{GetEnumValue("scalarassignment")}, + scalarAllocatableAssignmentEnum_{ + GetEnumValue("scalarallocatableassignment")}, + scalarPointerAssignmentEnum_{GetEnumValue("scalarpointerassignment")}, elementalAssignmentEnum_{GetEnumValue("elementalassignment")}, readFormattedEnum_{GetEnumValue("readformatted")}, readUnformattedEnum_{GetEnumValue("readunformatted")}, @@ -1122,10 +1129,10 @@ void RuntimeTableBuilder::DescribeSpecialProc( // Non-type-bound generic INTERFACEs and assignments from distinct // types must not be used for component intrinsic assignment. CHECK(proc->dummyArguments.size() == 2); - const auto t1{ + const auto &ddo1{ DEREF(std::get_if( - &proc->dummyArguments[0].u)) - .type.type()}; + &proc->dummyArguments[0].u))}; + const auto t1{ddo1.type.type()}; const auto t2{ DEREF(std::get_if( &proc->dummyArguments[1].u)) @@ -1137,7 +1144,13 @@ void RuntimeTableBuilder::DescribeSpecialProc( return; } which = proc->IsElemental() ? elementalAssignmentEnum_ - : scalarAssignmentEnum_; + : ddo1.attrs.test( + evaluate::characteristics::DummyDataObject::Attr::Allocatable) + ? scalarAllocatableAssignmentEnum_ + : ddo1.attrs.test( + evaluate::characteristics::DummyDataObject::Attr::Pointer) + ? 
scalarPointerAssignmentEnum_ + : scalarAssignmentEnum_; if (binding && binding->passName() && *binding->passName() == proc->dummyArguments[1].name) { argThatMightBeDescriptor = 1; diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90 index 5f2273de1e3d1..7dfcfe71fcb32 100644 --- a/flang/module/__fortran_type_info.f90 +++ b/flang/module/__fortran_type_info.f90 @@ -106,11 +106,14 @@ end type enum, bind(c) ! SpecialBinding::Which - enumerator :: ScalarAssignment = 1, ElementalAssignment = 2 - enumerator :: ReadFormatted = 3, ReadUnformatted = 4 - enumerator :: WriteFormatted = 5, WriteUnformatted = 6 - enumerator :: ElementalFinal = 7, AssumedRankFinal = 8 - enumerator :: ScalarFinal = 9 ! higher-rank final procedures follow + enumerator :: ScalarAssignment = 1 + enumerator :: ScalarAllocatableAssignment = 2 + enumerator :: ScalarPointerAssignment = 3 + enumerator :: ElementalAssignment = 4 + enumerator :: ReadFormatted = 5, ReadUnformatted = 6 + enumerator :: WriteFormatted = 7, WriteUnformatted = 8 + enumerator :: ElementalFinal = 9, AssumedRankFinal = 10 + enumerator :: ScalarFinal = 11 ! higher-rank final procedures follow end enum type, bind(c) :: SpecialBinding diff --git a/flang/runtime/assign.cpp b/flang/runtime/assign.cpp index d558ada51cd21..166cf54778921 100644 --- a/flang/runtime/assign.cpp +++ b/flang/runtime/assign.cpp @@ -352,6 +352,17 @@ RT_API_ATTRS static void Assign( // the Assign() is invoked recursively for component-per-component // assignments. if (to.rank() == 0) { + if (to.IsAllocatable()) { + if (const auto *special{toDerived->FindSpecialBinding(typeInfo:: + SpecialBinding::Which::ScalarAllocatableAssignment)}) { + return DoScalarDefinedAssignment(to, from, *special); + } + } else if (to.IsPointer()) { + if (const auto *special{toDerived->FindSpecialBinding( + typeInfo::SpecialBinding::Which::ScalarPointerAssignment)}) { + return DoScalarDefinedAssignment(to, from, *special); + } + } if (const auto *special{toDerived->FindSpecialBinding( typeInfo::SpecialBinding::Which::ScalarAssignment)}) { return DoScalarDefinedAssignment(to, from, *special); @@ -417,9 +428,8 @@ RT_API_ATTRS static void Assign( StaticDescriptor statDesc[2]; Descriptor &toCompDesc{statDesc[0].descriptor()}; Descriptor &fromCompDesc{statDesc[1].descriptor()}; - comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt); - comp.CreatePointerDescriptor( - fromCompDesc, from, terminator, fromAt); + comp.CreateTargetDescriptor(toCompDesc, to, terminator, toAt); + comp.CreateTargetDescriptor(fromCompDesc, from, terminator, fromAt); Assign(toCompDesc, fromCompDesc, terminator, nestedFlags); } else { // Component has intrinsic type; simply copy raw bytes std::size_t componentByteSize{comp.SizeInBytes(to)}; diff --git a/flang/runtime/descriptor-io.h b/flang/runtime/descriptor-io.h index ff5f683c6da52..66158b4076164 100644 --- a/flang/runtime/descriptor-io.h +++ b/flang/runtime/descriptor-io.h @@ -255,7 +255,7 @@ static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io, // Create a descriptor for the component StaticDescriptor statDesc; Descriptor &desc{statDesc.descriptor()}; - component.CreatePointerDescriptor( + component.CreateTargetDescriptor( desc, origDescriptor, terminator, origSubscripts); return DescriptorIO(io, desc, table); } else { diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp index af092de70f781..fe26a0d3a6e89 100644 --- a/flang/runtime/namelist.cpp +++ b/flang/runtime/namelist.cpp @@ -362,7 +362,7 @@ static RT_API_ATTRS bool 
HandleComponent(IoStatementState &io, Descriptor &desc, io.HandleRelativePosition(byteCount); // skip over '(' StaticDescriptor staticDesc; Descriptor &tmpDesc{staticDesc.descriptor()}; - comp->CreatePointerDescriptor(tmpDesc, source, handler); + comp->CreateTargetDescriptor(tmpDesc, source, handler); if (!HandleSubscripts(io, desc, tmpDesc, compName)) { return false; } @@ -370,7 +370,7 @@ static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc, } } if (!createdDesc) { - comp->CreatePointerDescriptor(desc, source, handler); + comp->CreateTargetDescriptor(desc, source, handler); } if (source.rank() > 0) { if (desc.rank() > 0) { diff --git a/flang/runtime/type-info.cpp b/flang/runtime/type-info.cpp index cb18c5669b5ff..531944086c7f7 100644 --- a/flang/runtime/type-info.cpp +++ b/flang/runtime/type-info.cpp @@ -134,7 +134,7 @@ RT_API_ATTRS void Component::EstablishDescriptor(Descriptor &descriptor, } } -RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor, +RT_API_ATTRS void Component::CreateTargetDescriptor(Descriptor &descriptor, const Descriptor &container, Terminator &terminator, const SubscriptValue *subscripts) const { RUNTIME_CHECK(terminator, genre_ == Genre::Data); @@ -144,7 +144,6 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor, } else { descriptor.set_base_addr(container.OffsetElement() + offset_); } - descriptor.raw().attribute = CFI_attribute_pointer; } RT_API_ATTRS const DerivedType *DerivedType::GetParentType() const { @@ -297,6 +296,12 @@ FILE *SpecialBinding::Dump(FILE *f) const { case Which::ScalarAssignment: std::fputs(" ScalarAssignment", f); break; + case Which::ScalarAllocatableAssignment: + std::fputs(" ScalarAllocatableAssignment", f); + break; + case Which::ScalarPointerAssignment: + std::fputs(" ScalarPointerAssignment", f); + break; case Which::ElementalAssignment: std::fputs(" ElementalAssignment", f); break; diff --git a/flang/runtime/type-info.h b/flang/runtime/type-info.h index c3f3595e32ef2..340971bfacf3e 100644 --- a/flang/runtime/type-info.h +++ b/flang/runtime/type-info.h @@ -89,9 +89,9 @@ class Component { RT_API_ATTRS void EstablishDescriptor( Descriptor &, const Descriptor &container, Terminator &) const; - // Creates a pointer descriptor from this component description, possibly + // Creates a descriptor from this component description, possibly // with subscripts - RT_API_ATTRS void CreatePointerDescriptor(Descriptor &, + RT_API_ATTRS void CreateTargetDescriptor(Descriptor &, const Descriptor &container, Terminator &, const SubscriptValue * = nullptr) const; @@ -126,14 +126,16 @@ class SpecialBinding { enum class Which : std::uint8_t { None = 0, ScalarAssignment = 1, - ElementalAssignment = 2, - ReadFormatted = 3, - ReadUnformatted = 4, - WriteFormatted = 5, - WriteUnformatted = 6, - ElementalFinal = 7, - AssumedRankFinal = 8, - ScalarFinal = 9, + ScalarAllocatableAssignment = 2, + ScalarPointerAssignment = 3, + ElementalAssignment = 4, + ReadFormatted = 5, + ReadUnformatted = 6, + WriteFormatted = 7, + WriteUnformatted = 8, + ElementalFinal = 9, + AssumedRankFinal = 10, + ScalarFinal = 11, // higher-ranked final procedures follow }; diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90 index 0d381f10b0483..b6f0e2e12ff6f 100644 --- a/flang/test/Semantics/typeinfo01.f90 +++ b/flang/test/Semantics/typeinfo01.f90 @@ -102,8 +102,8 @@ impure elemental subroutine s1(x, y) class(t), intent(out) :: x class(t), intent(in) :: y end subroutine -!CHECK: 
.dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=16_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=4_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] end module @@ -125,8 +125,8 @@ impure elemental subroutine s3(x) subroutine s4(x) type(t), contiguous :: x(:,:,:) end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=29184_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=9_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=13_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=14_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)] end module module m09 @@ -167,8 +167,8 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=480_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=7_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=8_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)] end module @@ -216,8 +216,8 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=480_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=8_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] end module module m11 @@ -260,7 +260,7 @@ module m13 contains procedure :: assign1, assign2 generic :: assignment(=) => assign1, assign2 - ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=assign1)] + ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=4_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=assign1)] end type contains impure elemental subroutine assign1(to, from) diff --git a/flang/test/Semantics/typeinfo02.f90 b/flang/test/Semantics/typeinfo02.f90 index 29d14c7a0f196..2b911e7238f88 100644 --- a/flang/test/Semantics/typeinfo02.f90 +++ b/flang/test/Semantics/typeinfo02.f90 @@ -29,5 +29,5 @@ subroutine wf2(x,u,iot,v,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine end module -!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf1)] -!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf2)] +!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf1)] +!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf2)] diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90 index de8464321a409..2527f656da3d1 100644 --- a/flang/test/Semantics/typeinfo04.f90 +++ b/flang/test/Semantics/typeinfo04.f90 @@ -7,7 +7,7 @@ module m contains final :: final end type -!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=512_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) type, abstract :: t1 end type !CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity 
type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90 new file mode 100644 index 0000000000000..983e09be0055b --- /dev/null +++ b/flang/test/Semantics/typeinfo12.f90 @@ -0,0 +1,52 @@ +!RUN: bbc --dump-symbols %s | FileCheck %s +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s + +! Test defined assignment with allocatable / pointer LHS arguments. +! The special bindings for the defined assignments must reflect that +! their LHS arguments are allocatables and pointers. +! (This program is executable and should print 1; 102; 3 204.) + +module m + type :: base + integer :: i + contains + procedure, pass(src) :: ass1, ass2 + generic :: assignment(=) => ass1, ass2 + end type base + type, extends(base) :: derived + end type + +!CHECK: .dt.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.base,name=.n.base,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.base,procptr=NULL(),special=.s.base,specialbitset=12_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.derived, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.derived,name=.n.derived,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.derived,procptr=NULL(),special=.s.derived,specialbitset=12_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:1_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass1),specialbinding(which=3_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass2)] +!CHECK: .s.derived, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:1_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass1),specialbinding(which=3_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass2)] + +contains + subroutine ass1(res, src) + class(base), allocatable, intent(out) :: res + class(base), intent(in) :: src + allocate(res, source=src) + res%i = res%i + 100 + end subroutine + subroutine ass2(res, src) + class(base), pointer, intent(in out) :: res + class(base), intent(in) :: src + allocate(res, source=src) + res%i = src%i + 200 + end subroutine +end +program genext + use m + type(derived) :: od1 + class(base), allocatable :: od2 + class(base), pointer :: od3a, od3b + od1 = derived(1) + print *, od1%i + od2 = derived(2) + print *, od2%i + allocate(od3a) + od3a%i = 3 + od3b => od3a + od3b = derived(4) + print *, od3a%i, od3b%i +end program genext From 37f94cd99a5bc4b186651d6967d8595c4786d8ed Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 14:14:33 -0700 Subject: [PATCH 021/114] [flang] Accept KIND(x) when x is assumed-rank (#107787) Don't emit a bogus error about being unable to forward an assumed-rank dummy argument as an actual argument in the case of the KIND intrinsic function. 
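A sketch of the now-accepted usage (essentially what the new test below
checks; the subroutine name here is invented):

    subroutine s(x)
      real(8) :: x(..) ! assumed-rank dummy argument
      print *, kind(x) ! folds to the constant 8; x itself is never forwarded
    end subroutine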
Fixes https://github.com/llvm/llvm-project/issues/107782.
---
 flang/lib/Evaluate/intrinsics.cpp              | 2 +-
 flang/test/Evaluate/fold-assumed-rank-kind.f90 | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Evaluate/fold-assumed-rank-kind.f90

diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp
index ebe946ac60ccb..876c2aed4ffd6 100644
--- a/flang/lib/Evaluate/intrinsics.cpp
+++ b/flang/lib/Evaluate/intrinsics.cpp
@@ -587,7 +587,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{
     {"izext", {{"i", AnyInt}}, TypePattern{IntType, KindCode::exactKind, 2}},
     {"jzext", {{"i", AnyInt}}, DefaultInt},
     {"kind",
-        {{"x", AnyIntrinsic, Rank::elemental, Optionality::required,
+        {{"x", AnyIntrinsic, Rank::anyOrAssumedRank, Optionality::required,
             common::Intent::In, {ArgFlag::canBeMoldNull}}},
         DefaultInt, Rank::elemental, IntrinsicClass::inquiryFunction},
     {"lbound",
diff --git a/flang/test/Evaluate/fold-assumed-rank-kind.f90 b/flang/test/Evaluate/fold-assumed-rank-kind.f90
new file mode 100644
index 0000000000000..674f60c6a0e2f
--- /dev/null
+++ b/flang/test/Evaluate/fold-assumed-rank-kind.f90
@@ -0,0 +1,6 @@
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+subroutine subr(ar)
+  real(8) :: ar(..)
+!CHECK: PRINT *, 8_4
+  print *, kind(ar)
+end

From d418a03e01e6a31b51b0c9dd42ba46da6c47f89d Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 10 Sep 2024 14:15:20 -0700
Subject: [PATCH 022/114] [flang] Fix error from semantics on use-associated
 procedure pointer (#107928)

Use-associated procedure pointers were eliciting bogus errors from
semantics if their modules also contained generic procedure interfaces
of the same name. (The compiler handles this case correctly when the
specific procedure of the same name is not a pointer.)

With this fix, the test case in
https://github.com/llvm/llvm-project/issues/107784 no longer
experiences semantic errors; however, it now crashes unexpectedly in
lowering.

---
 flang/include/flang/Semantics/scope.h   |  2 +
 flang/lib/Semantics/compute-offsets.cpp |  7 +++
 flang/lib/Semantics/expression.cpp      |  7 ++-
 flang/lib/Semantics/resolve-names.cpp   | 69 +++++++++++++++++++++----
 flang/lib/Semantics/symbol.cpp          |  3 +-
 flang/test/Semantics/generic10.f90      | 17 ++++++
 6 files changed, 92 insertions(+), 13 deletions(-)
 create mode 100644 flang/test/Semantics/generic10.f90

diff --git a/flang/include/flang/Semantics/scope.h b/flang/include/flang/Semantics/scope.h
index a58163f5460c2..e73a507e9b3f5 100644
--- a/flang/include/flang/Semantics/scope.h
+++ b/flang/include/flang/Semantics/scope.h
@@ -138,6 +138,8 @@ class Scope {
   const_iterator cend() const { return symbols_.cend(); }
 
   // Return symbols in declaration order (the iterators above are in name order)
+  // When a generic procedure interface shadows a derived type or specific
+  // procedure, only the generic's symbol appears in the output.
SymbolVector GetSymbols() const; MutableSymbolVector GetSymbols(); diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp index d9a9576e9d676..b5a58ddca0ecd 100644 --- a/flang/lib/Semantics/compute-offsets.cpp +++ b/flang/lib/Semantics/compute-offsets.cpp @@ -114,6 +114,13 @@ void ComputeOffsetsHelper::Compute(Scope &scope) { dependents_.find(symbol) == dependents_.end() && equivalenceBlock_.find(symbol) == equivalenceBlock_.end()) { DoSymbol(*symbol); + if (auto *generic{symbol->detailsIf()}) { + if (Symbol * specific{generic->specific()}; + specific && !FindCommonBlockContaining(*specific)) { + // might be a shadowed procedure pointer + DoSymbol(*specific); + } + } } } // Ensure that the size is a multiple of the alignment diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index a5b5d48e2bfee..943512f75d7eb 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -210,7 +210,8 @@ class ArgumentAnalyzer { // or procedure pointer reference in a ProcedureDesignator. MaybeExpr ExpressionAnalyzer::Designate(DataRef &&ref) { const Symbol &last{ref.GetLastSymbol()}; - const Symbol &symbol{BypassGeneric(last).GetUltimate()}; + const Symbol &specific{BypassGeneric(last)}; + const Symbol &symbol{specific.GetUltimate()}; if (semantics::IsProcedure(symbol)) { if (symbol.attrs().test(semantics::Attr::ABSTRACT)) { Say("Abstract procedure interface '%s' may not be used as a designator"_err_en_US, @@ -226,6 +227,10 @@ MaybeExpr ExpressionAnalyzer::Designate(DataRef &&ref) { } else if (!symbol.attrs().test(semantics::Attr::INTRINSIC)) { if (symbol.has()) { Say("'%s' is not a specific procedure"_err_en_US, last.name()); + } else if (IsProcedurePointer(specific)) { + // For procedure pointers, retain associations so that data accesses + // from client modules will work. + return Expr{ProcedureDesignator{specific}}; } else { return Expr{ProcedureDesignator{symbol}}; } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index b764678357db3..d8f601212d8d0 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -618,6 +618,20 @@ class ScopeHandler : public ImplicitRulesVisitor { return *derivedType; } } + } else if constexpr (std::is_same_v) { + if (auto *d{symbol->detailsIf()}) { + if (!d->derivedType()) { + // procedure pointer with same name as a generic + auto *specific{d->specific()}; + if (!specific) { + specific = &currScope().MakeSymbol(name, attrs, std::move(details)); + d->set_specific(*specific); + } else { + SayAlreadyDeclared(name, *specific); + } + return *specific; + } + } } if (symbol->CanReplaceDetails(details)) { // update the existing symbol @@ -3035,14 +3049,26 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, return; } const Symbol &useUltimate{useSymbol.GetUltimate()}; + const auto *useGeneric{useUltimate.detailsIf()}; if (localSymbol->has()) { - localSymbol->set_details(UseDetails{localName, useSymbol}); - localSymbol->attrs() = - useSymbol.attrs() & ~Attrs{Attr::PUBLIC, Attr::PRIVATE, Attr::SAVE}; - localSymbol->implicitAttrs() = - localSymbol->attrs() & Attrs{Attr::ASYNCHRONOUS, Attr::VOLATILE}; - localSymbol->flags() = useSymbol.flags(); - return; + if (useGeneric && useGeneric->specific() && + IsProcedurePointer(*useGeneric->specific())) { + // We are use-associating a generic that shadows a procedure pointer. 
+ // Local references that might be made to that procedure pointer should + // use a UseDetails symbol for proper data addressing. So create an + // empty local generic now into which the use-associated generic may + // be copied. + localSymbol->set_details(GenericDetails{}); + localSymbol->get().set_kind(useGeneric->kind()); + } else { // just create UseDetails + localSymbol->set_details(UseDetails{localName, useSymbol}); + localSymbol->attrs() = + useSymbol.attrs() & ~Attrs{Attr::PUBLIC, Attr::PRIVATE, Attr::SAVE}; + localSymbol->implicitAttrs() = + localSymbol->attrs() & Attrs{Attr::ASYNCHRONOUS, Attr::VOLATILE}; + localSymbol->flags() = useSymbol.flags(); + return; + } } Symbol &localUltimate{localSymbol->GetUltimate()}; @@ -3066,10 +3092,7 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, // - anything other than a derived type, non-generic procedure, or // generic procedure being combined with something other than an // prior USE association of itself - auto *localGeneric{localUltimate.detailsIf()}; - const auto *useGeneric{useUltimate.detailsIf()}; - Symbol *localDerivedType{nullptr}; if (localUltimate.has()) { localDerivedType = &localUltimate; @@ -3261,6 +3284,15 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, // At this point, there must be at least one generic interface. CHECK(localGeneric || (useGeneric && (localDerivedType || localProcedure))); + // Ensure that a use-associated specific procedure that is a procedure + // pointer is properly represented as a USE association of an entity. + if (IsProcedurePointer(useProcedure)) { + Symbol &combined{currScope().MakeSymbol(localSymbol->name(), + useProcedure->attrs(), UseDetails{localName, *useProcedure})}; + combined.flags() |= useProcedure->flags(); + combinedProcedure = &combined; + } + if (localGeneric) { // Create a local copy of a previously use-associated generic so that // it can be locally extended without corrupting the original. @@ -5079,7 +5111,22 @@ bool DeclarationVisitor::HasCycle( Symbol &DeclarationVisitor::DeclareProcEntity( const parser::Name &name, Attrs attrs, const Symbol *interface) { - Symbol &symbol{DeclareEntity(name, attrs)}; + Symbol *proc{nullptr}; + if (auto *extant{FindInScope(name)}) { + if (auto *d{extant->detailsIf()}; d && !d->derivedType()) { + // procedure pointer with same name as a generic + if (auto *specific{d->specific()}) { + SayAlreadyDeclared(name, *specific); + } else { + // Create the ProcEntityDetails symbol in the scope as the "specific()" + // symbol behind an existing GenericDetails symbol of the same name. + proc = &Resolve(name, + currScope().MakeSymbol(name.source, attrs, ProcEntityDetails{})); + d->set_specific(*proc); + } + } + } + Symbol &symbol{proc ? 
*proc : DeclareEntity(name, attrs)}; if (auto *details{symbol.detailsIf()}) { if (context().HasError(symbol)) { } else if (HasCycle(symbol, interface)) { diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp index b593bf89b18bc..14d6564664f2c 100644 --- a/flang/lib/Semantics/symbol.cpp +++ b/flang/lib/Semantics/symbol.cpp @@ -210,8 +210,9 @@ const Symbol *GenericDetails::CheckSpecific() const { } Symbol *GenericDetails::CheckSpecific() { if (specific_ && !specific_->has()) { + const Symbol &ultimate{specific_->GetUltimate()}; for (const Symbol &proc : specificProcs_) { - if (&proc == specific_) { + if (&proc.GetUltimate() == &ultimate) { return nullptr; } } diff --git a/flang/test/Semantics/generic10.f90 b/flang/test/Semantics/generic10.f90 new file mode 100644 index 0000000000000..203d0bb855208 --- /dev/null +++ b/flang/test/Semantics/generic10.f90 @@ -0,0 +1,17 @@ +! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +module m + procedure(func), pointer :: foo + interface foo + procedure :: foo + end interface + contains + function func(x) + func = x + end +end + +program main + use m +!CHECK: foo => func + foo => func +end From 5a2071b184e00f086f5b538f2209bcdb8aba3078 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Tue, 10 Sep 2024 14:20:32 -0700 Subject: [PATCH 023/114] [compiler-rt][rtsan] Improve error message wording to match ASan style (#107620) --- compiler-rt/lib/rtsan/rtsan_context.cpp | 9 +++++---- compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h | 7 ++++--- compiler-rt/test/rtsan/basic.cpp | 3 ++- compiler-rt/test/rtsan/disabler.cpp | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/compiler-rt/lib/rtsan/rtsan_context.cpp b/compiler-rt/lib/rtsan/rtsan_context.cpp index a49b70360babb..8609394fa222f 100644 --- a/compiler-rt/lib/rtsan/rtsan_context.cpp +++ b/compiler-rt/lib/rtsan/rtsan_context.cpp @@ -95,10 +95,11 @@ void __rtsan::PrintDiagnostics(const char *intercepted_function_name, uptr pc, uptr bp) { ScopedErrorReportLock l; - fprintf(stderr, - "Real-time violation: intercepted call to real-time unsafe function " - "`%s` in real-time context! Stack trace:\n", - intercepted_function_name); + Report("ERROR: RealtimeSanitizer: unsafe-library-call\n"); + Printf("Intercepted call to real-time unsafe function " + "`%s` in real-time context!\n", + intercepted_function_name); + __rtsan::PrintStackTrace(pc, bp); } diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h b/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h index 6ca09cf657094..4ba4fc5e53086 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h @@ -30,9 +30,10 @@ void ExpectRealtimeDeath(Function &&Func, auto GetExpectedErrorSubstring = [&]() -> std::string { return intercepted_method_name != nullptr - ? "Real-time violation: intercepted call to real-time unsafe " - "function `" + - std::string(intercepted_method_name) + "`" + ? ".*==ERROR: RealtimeSanitizer: unsafe-library-call.*" + "Intercepted call to real-time unsafe function `" + + std::string(intercepted_method_name) + + "` in real-time context!" 
: ""; }; diff --git a/compiler-rt/test/rtsan/basic.cpp b/compiler-rt/test/rtsan/basic.cpp index f4075bb27e4f9..607db90213a30 100644 --- a/compiler-rt/test/rtsan/basic.cpp +++ b/compiler-rt/test/rtsan/basic.cpp @@ -17,6 +17,7 @@ void violation() [[clang::nonblocking]] { int main() { violation(); return 0; - // CHECK: Real-time violation: intercepted call to real-time unsafe function `malloc` in real-time context! Stack trace: + // CHECK: ==ERROR: RealtimeSanitizer: unsafe-library-call + // CHECK-NEXT: Intercepted call to real-time unsafe function `malloc` in real-time context! // CHECK-NEXT: {{.*malloc*}} } diff --git a/compiler-rt/test/rtsan/disabler.cpp b/compiler-rt/test/rtsan/disabler.cpp index 0a6411a2be694..dd1d4439beae4 100644 --- a/compiler-rt/test/rtsan/disabler.cpp +++ b/compiler-rt/test/rtsan/disabler.cpp @@ -41,7 +41,7 @@ int main() { // CHECK: Allocated pointer {{.*}} in disabled context // CHECK: Allocated second pointer {{.*}} in disabled context // CHECK: Free'd second pointer in disabled context - // CHECK: {{.*Real-time violation.*}} + // CHECK: ==ERROR: RealtimeSanitizer: unsafe-library-call // CHECK-NOT: {{.*malloc*}} // CHECK-NEXT: {{.*free.*}} } From 5495c36104103c4172808a28e8b2df3c806b1d85 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 10 Sep 2024 14:22:57 -0700 Subject: [PATCH 024/114] [WebAssembly] Misc. refactoring in AsmTypeCheck (NFC) (#107978) Existing methods in AsmTypeCheck assumes symbol operand is the 0th operand; they take a `MCInst` and take `getOperand(0)` on it. I think passing a `MCOperand` removes this assumption and also is more intuitive. This was motivated by a new `try_table` instruction, whose support is going to be added to AsmTypeCheck soon, which has tag symbol operands in any position, depending on the number and the kinds of catch clauses. This PR changes all methods' signature that assumes the 0th operand is the relevant one, even if it's not the symbol operand. This also adds `getSignature` method, which factors out the common task when getting a `WasmSignature` from a `MCOperand`. 
--- .../AsmParser/WebAssemblyAsmTypeCheck.cpp | 92 +++++++++++-------- .../AsmParser/WebAssemblyAsmTypeCheck.h | 11 ++- 2 files changed, 62 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index 9f9e7d1c0ed06..ec3d51d4e0e84 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -112,9 +112,9 @@ bool WebAssemblyAsmTypeCheck::popRefType(SMLoc ErrorLoc) { return false; } -bool WebAssemblyAsmTypeCheck::getLocal(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getLocal(SMLoc ErrorLoc, const MCOperand &LocalOp, wasm::ValType &Type) { - auto Local = static_cast(Inst.getOperand(0).getImm()); + auto Local = static_cast(LocalOp.getImm()); if (Local >= LocalTypes.size()) return typeError(ErrorLoc, StringRef("no local type specified for index ") + std::to_string(Local)); @@ -178,21 +178,21 @@ bool WebAssemblyAsmTypeCheck::checkSig(SMLoc ErrorLoc, return false; } -bool WebAssemblyAsmTypeCheck::getSymRef(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getSymRef(SMLoc ErrorLoc, const MCOperand &SymOp, const MCSymbolRefExpr *&SymRef) { - auto Op = Inst.getOperand(0); - if (!Op.isExpr()) + if (!SymOp.isExpr()) return typeError(ErrorLoc, StringRef("expected expression operand")); - SymRef = dyn_cast(Op.getExpr()); + SymRef = dyn_cast(SymOp.getExpr()); if (!SymRef) return typeError(ErrorLoc, StringRef("expected symbol operand")); return false; } -bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, + const MCOperand &GlobalOp, wasm::ValType &Type) { const MCSymbolRefExpr *SymRef; - if (getSymRef(ErrorLoc, Inst, SymRef)) + if (getSymRef(ErrorLoc, GlobalOp, SymRef)) return true; auto WasmSym = cast(&SymRef->getSymbol()); switch (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA)) { @@ -217,10 +217,10 @@ bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst, return false; } -bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCOperand &TableOp, wasm::ValType &Type) { const MCSymbolRefExpr *SymRef; - if (getSymRef(ErrorLoc, Inst, SymRef)) + if (getSymRef(ErrorLoc, TableOp, SymRef)) return true; auto WasmSym = cast(&SymRef->getSymbol()); if (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA) != @@ -231,6 +231,34 @@ bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCInst &Inst, return false; } +bool WebAssemblyAsmTypeCheck::getSignature(SMLoc ErrorLoc, + const MCOperand &SigOp, + wasm::WasmSymbolType Type, + const wasm::WasmSignature *&Sig) { + const MCSymbolRefExpr *SymRef = nullptr; + if (getSymRef(ErrorLoc, SigOp, SymRef)) + return true; + const auto *WasmSym = cast(&SymRef->getSymbol()); + Sig = WasmSym->getSignature(); + + if (!Sig || WasmSym->getType() != Type) { + const char *TypeName = nullptr; + switch (Type) { + case wasm::WASM_SYMBOL_TYPE_FUNCTION: + TypeName = "func"; + break; + case wasm::WASM_SYMBOL_TYPE_TAG: + TypeName = "tag"; + break; + default: + return true; + } + return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() + + ": missing ." + TypeName + "type"); + } + return false; +} + bool WebAssemblyAsmTypeCheck::endOfFunction(SMLoc ErrorLoc) { // Check the return types. 
for (auto RVT : llvm::reverse(ReturnTypes)) { @@ -252,48 +280,48 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, dumpTypeStack("typechecking " + Name + ": "); wasm::ValType Type; if (Name == "local.get") { - if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; Stack.push_back(Type); } else if (Name == "local.set") { - if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; } else if (Name == "local.tee") { - if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; Stack.push_back(Type); } else if (Name == "global.get") { - if (getGlobal(Operands[1]->getStartLoc(), Inst, Type)) + if (getGlobal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; Stack.push_back(Type); } else if (Name == "global.set") { - if (getGlobal(Operands[1]->getStartLoc(), Inst, Type)) + if (getGlobal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; } else if (Name == "table.get") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; Stack.push_back(Type); } else if (Name == "table.set") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; } else if (Name == "table.size") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; Stack.push_back(wasm::ValType::I32); } else if (Name == "table.grow") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; @@ -301,7 +329,7 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, return true; Stack.push_back(wasm::ValType::I32); } else if (Name == "table.fill") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; @@ -352,15 +380,10 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, return true; Unreachable = false; if (Name == "catch") { - const MCSymbolRefExpr *SymRef; - if (getSymRef(Operands[1]->getStartLoc(), Inst, SymRef)) + const wasm::WasmSignature *Sig = nullptr; + if (getSignature(Operands[1]->getStartLoc(), Inst.getOperand(0), + wasm::WASM_SYMBOL_TYPE_TAG, Sig)) return true; - const auto *WasmSym = cast(&SymRef->getSymbol()); - const auto *Sig = WasmSym->getSignature(); - if (!Sig || WasmSym->getType() != wasm::WASM_SYMBOL_TYPE_TAG) - return typeError(Operands[1]->getStartLoc(), StringRef("symbol ") + - WasmSym->getName() + - ": missing .tagtype"); // catch instruction pushes values whose types are specified in the tag's // "params" part Stack.insert(Stack.end(), Sig->Params.begin(), Sig->Params.end()); @@ -383,15 +406,10 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, if 
(Name == "return_call_indirect" && endOfFunction(ErrorLoc)) return true; } else if (Name == "call" || Name == "return_call") { - const MCSymbolRefExpr *SymRef; - if (getSymRef(Operands[1]->getStartLoc(), Inst, SymRef)) - return true; - auto WasmSym = cast(&SymRef->getSymbol()); - auto Sig = WasmSym->getSignature(); - if (!Sig || WasmSym->getType() != wasm::WASM_SYMBOL_TYPE_FUNCTION) - return typeError(Operands[1]->getStartLoc(), StringRef("symbol ") + - WasmSym->getName() + - ": missing .functype"); + const wasm::WasmSignature *Sig = nullptr; + if (getSignature(Operands[1]->getStartLoc(), Inst.getOperand(0), + wasm::WASM_SYMBOL_TYPE_FUNCTION, Sig)) + return true; if (checkSig(ErrorLoc, *Sig)) return true; if (Name == "return_call" && endOfFunction(ErrorLoc)) diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h index 6fa95c3929753..9ba5693719e91 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h @@ -41,14 +41,17 @@ class WebAssemblyAsmTypeCheck final { bool typeError(SMLoc ErrorLoc, const Twine &Msg); bool popType(SMLoc ErrorLoc, std::optional EVT); bool popRefType(SMLoc ErrorLoc); - bool getLocal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); + bool getLocal(SMLoc ErrorLoc, const MCOperand &LocalOp, wasm::ValType &Type); bool checkEnd(SMLoc ErrorLoc, bool PopVals = false); bool checkBr(SMLoc ErrorLoc, size_t Level); bool checkSig(SMLoc ErrorLoc, const wasm::WasmSignature &Sig); - bool getSymRef(SMLoc ErrorLoc, const MCInst &Inst, + bool getSymRef(SMLoc ErrorLoc, const MCOperand &SymOp, const MCSymbolRefExpr *&SymRef); - bool getGlobal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); - bool getTable(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); + bool getGlobal(SMLoc ErrorLoc, const MCOperand &GlobalOp, + wasm::ValType &Type); + bool getTable(SMLoc ErrorLoc, const MCOperand &TableOp, wasm::ValType &Type); + bool getSignature(SMLoc ErrorLoc, const MCOperand &SigOp, + wasm::WasmSymbolType Type, const wasm::WasmSignature *&Sig); public: WebAssemblyAsmTypeCheck(MCAsmParser &Parser, const MCInstrInfo &MII, From ace6d5f2ce53ae88205fc39dafa45e5682fd9a52 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Tue, 10 Sep 2024 14:33:04 -0700 Subject: [PATCH 025/114] [SandboxIR] Fix base class of FenceInst. Verify instructions when building a BB in debug mode. (#108078) @vporpo suggested in an offline conversation that verifying all instructions during `BasicBlock::buildBasicBlockFromLLVMIR` would be a good way to get coverage for errors like this during testing. He also suggested not gating it on `SBVEC_EXPENSIVE_CHECKS` for now as the checks are pretty basic at the moment and they only affect Debug builds. 
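For reference, the verification hook boils down to the following (condensed from the diff below; in Debug builds it now runs on every block built from LLVM IR, with no `SBVEC_EXPENSIVE_CHECKS` gate):

```cpp
// Condensed sketch of the new out-of-line BasicBlock::verify(): check the
// block itself, then run the existing per-instruction verify() on every
// instruction, so buildBasicBlockFromLLVMIR() exercises all of them.
#ifndef NDEBUG
void BasicBlock::verify() const {
  assert(isa<llvm::BasicBlock>(Val) && "Expected BasicBlock!");
  for (const auto &I : *this)
    I.verify();
}
#endif // NDEBUG
```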
--- llvm/include/llvm/SandboxIR/SandboxIR.h | 6 ++---- llvm/lib/SandboxIR/SandboxIR.cpp | 10 +++++++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 8f025f7257b39..91d6b58cfee00 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -1194,9 +1194,7 @@ class BasicBlock : public Value { Instruction &back() const; #ifndef NDEBUG - void verify() const final { - assert(isa(Val) && "Expected BasicBlock!"); - } + void verify() const final; void dumpOS(raw_ostream &OS) const final; #endif }; @@ -1435,7 +1433,7 @@ template class SingleLLVMInstructionImpl : public Instruction { #endif }; -class FenceInst : public SingleLLVMInstructionImpl { +class FenceInst : public SingleLLVMInstructionImpl { FenceInst(llvm::FenceInst *FI, Context &Ctx) : SingleLLVMInstructionImpl(ClassID::Fence, Opcode::Fence, FI, Ctx) {} friend Context; // For constructor; diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 07472d1bff47b..a4b68bd8ffd7c 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -3173,7 +3173,7 @@ void BasicBlock::buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB) { Ctx.getOrCreateValue(Op); } } -#if !defined(NDEBUG) && defined(SBVEC_EXPENSIVE_CHECKS) +#if !defined(NDEBUG) verify(); #endif } @@ -3249,4 +3249,12 @@ void BasicBlock::dumpOS(raw_ostream &OS) const { } } } + +void BasicBlock::verify() const { + assert(isa(Val) && "Expected BasicBlock!"); + for (const auto &I : *this) { + I.verify(); + } +} + #endif // NDEBUG From 0fc4147c6d0c5bfb1fd9ed2a9f1c3a70e9281813 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 10 Sep 2024 14:23:16 -0700 Subject: [PATCH 026/114] [RISCV] Add test coverage for mul X, C where C=2^N*(3,5,9)*(3,5,9) --- llvm/test/CodeGen/RISCV/rv64zba.ll | 85 ++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 62595fd4a7ad6..19d7324eeff4a 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -569,6 +569,60 @@ define i64 @addmul72(i64 %a, i64 %b) { ret i64 %d } +define i64 @mul50(i64 %a) { +; CHECK-LABEL: mul50: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 50 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 50 + ret i64 %c +} + +define i64 @addmul50(i64 %a, i64 %b) { +; CHECK-LABEL: addmul50: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 50 +; CHECK-NEXT: mul a0, a0, a2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 50 + %d = add i64 %c, %b + ret i64 %d +} + +define i64 @mul100(i64 %a) { +; CHECK-LABEL: mul100: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 100 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 100 + ret i64 %c +} + +define i64 @addmul100(i64 %a, i64 %b) { +; CHECK-LABEL: addmul100: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 100 +; CHECK-NEXT: mul a0, a0, a2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 100 + %d = add i64 %c, %b + ret i64 %d +} + +define i64 @mul162(i64 %a) { +; CHECK-LABEL: mul162: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 162 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 162 + ret i64 %c +} + define i64 @addmul162(i64 %a, i64 %b) { ; CHECK-LABEL: addmul162: ; CHECK: # %bb.0: @@ -581,6 +635,16 @@ define i64 @addmul162(i64 %a, i64 %b) { ret i64 %d } +define i64 @mul180(i64 %a) { +; CHECK-LABEL: mul180: +; CHECK: 
# %bb.0: +; CHECK-NEXT: li a1, 180 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 180 + ret i64 %c +} + define i64 @addmul180(i64 %a, i64 %b) { ; CHECK-LABEL: addmul180: ; CHECK: # %bb.0: @@ -605,6 +669,27 @@ define i64 @add255mul180(i64 %a) { ret i64 %d } +define i64 @mul200(i64 %a) { +; CHECK-LABEL: mul200: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 200 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 200 + ret i64 %c +} + +define i64 @addmul200(i64 %a, i64 %b) { +; CHECK-LABEL: addmul200: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 200 +; CHECK-NEXT: mul a0, a0, a2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 200 + %d = add i64 %c, %b + ret i64 %d +} define i64 @addmul4096(i64 %a, i64 %b) { ; CHECK-LABEL: addmul4096: From 2ddf21bc702de25a34bb4a13b3610d8cc6bf3ca0 Mon Sep 17 00:00:00 2001 From: vporpo Date: Tue, 10 Sep 2024 14:42:09 -0700 Subject: [PATCH 027/114] [SandboxIR] Pass registry (#108084) This patch implements a simple Pass Registry class, which takes ownership of the passes registered with it and provides an interface to get the pass pointer by its name. --- llvm/include/llvm/SandboxIR/PassManager.h | 29 +++++++++++++++++++++++ llvm/lib/SandboxIR/Pass.cpp | 1 + llvm/lib/SandboxIR/PassManager.cpp | 6 +++++ llvm/unittests/SandboxIR/PassTest.cpp | 29 +++++++++++++++++++++++ 4 files changed, 65 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h index cb321fe699a56..5e250641f3b3f 100644 --- a/llvm/include/llvm/SandboxIR/PassManager.h +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -18,6 +18,7 @@ #ifndef LLVM_SANDBOXIR_PASSMANAGER_H #define LLVM_SANDBOXIR_PASSMANAGER_H +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/SandboxIR/Pass.h" #include "llvm/Support/Debug.h" @@ -65,6 +66,34 @@ class FunctionPassManager final bool runOnFunction(Function &F) final; }; +/// Owns the passes and provides an API to get a pass by its name. +class PassRegistry { + SmallVector, 8> Passes; + DenseMap NameToPassMap; + +public: + PassRegistry() = default; + /// Registers \p PassPtr and takes ownership. + Pass ®isterPass(std::unique_ptr &&PassPtr) { + auto &PassRef = *PassPtr.get(); + NameToPassMap[PassRef.getName()] = &PassRef; + Passes.push_back(std::move(PassPtr)); + return PassRef; + } + /// \Returns the pass with name \p Name, or null if not registered. + Pass *getPassByName(StringRef Name) const { + auto It = NameToPassMap.find(Name); + return It != NameToPassMap.end() ? 
It->second : nullptr; + } +#ifndef NDEBUG + void print(raw_ostream &OS) const { + for (const auto &PassPtr : Passes) + OS << PassPtr->getName() << "\n"; + } + LLVM_DUMP_METHOD void dump() const; +#endif +}; + } // namespace llvm::sandboxir #endif // LLVM_SANDBOXIR_PASSMANAGER_H diff --git a/llvm/lib/SandboxIR/Pass.cpp b/llvm/lib/SandboxIR/Pass.cpp index 64e1b609a9f49..c6ec1aec48b19 100644 --- a/llvm/lib/SandboxIR/Pass.cpp +++ b/llvm/lib/SandboxIR/Pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Pass.h" +#include "llvm/SandboxIR/PassManager.h" #include "llvm/Support/Debug.h" using namespace llvm::sandboxir; diff --git a/llvm/lib/SandboxIR/PassManager.cpp b/llvm/lib/SandboxIR/PassManager.cpp index d10f3926f7bcd..2dd19e74734db 100644 --- a/llvm/lib/SandboxIR/PassManager.cpp +++ b/llvm/lib/SandboxIR/PassManager.cpp @@ -20,3 +20,9 @@ bool FunctionPassManager::runOnFunction(Function &F) { // TODO: Check ChangeAll against hashes before/after. return Change; } +#ifndef NDEBUG +void PassRegistry::dump() const { + print(dbgs()); + dbgs() << "\n"; +} +#endif // NDEBUG diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp index 8e080128b15b3..3517f0e32b1bb 100644 --- a/llvm/unittests/SandboxIR/PassTest.cpp +++ b/llvm/unittests/SandboxIR/PassTest.cpp @@ -133,3 +133,32 @@ define void @foo() { EXPECT_EQ(Buff, "test-fpm(test-pass1,test-pass2)"); #endif // NDEBUG } + +TEST_F(PassTest, PassRegistry) { + class TestPass1 final : public FunctionPass { + public: + TestPass1() : FunctionPass("test-pass1") {} + bool runOnFunction(Function &F) final { return false; } + }; + class TestPass2 final : public FunctionPass { + public: + TestPass2() : FunctionPass("test-pass2") {} + bool runOnFunction(Function &F) final { return false; } + }; + + PassRegistry Registry; + auto &TP1 = Registry.registerPass(std::make_unique()); + auto &TP2 = Registry.registerPass(std::make_unique()); + + // Check getPassByName(). + EXPECT_EQ(Registry.getPassByName("test-pass1"), &TP1); + EXPECT_EQ(Registry.getPassByName("test-pass2"), &TP2); + +#ifndef NDEBUG + // Check print(). + std::string Buff; + llvm::raw_string_ostream SS(Buff); + Registry.print(SS); + EXPECT_EQ(Buff, "test-pass1\ntest-pass2\n"); +#endif // NDEBUG +} From d452429821d3263a73b27387324bc272b47ed1bf Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 15:00:42 -0700 Subject: [PATCH 028/114] [flang] Fix shared library flang build (#108101) I broke the shared library builds a few minutes ago by introducing a cyclic dependency between two parts of the compiler. Fix. 
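The essence of the fix, as visible in the diff below, is to replicate the handful of checks this call site needed from `semantics::IsInitialized()` directly in lib/Evaluate, so the call that pulled in the cyclic library dependency goes away. A simplified sketch (names as in the diff below; not a drop-in replacement for the general helper):

```cpp
// Sketch: compute the predicate locally instead of calling the helper whose
// home library completed the dependency cycle. `ultimate` is the resolved
// symbol and `object` its object-entity details, as in the diff below.
bool isInitialized{semantics::IsSaved(ultimate) && !IsAllocatable(ultimate) &&
                   object &&
                   (ultimate.test(Symbol::Flag::InDataStmt) ||
                    object->init().has_value())};
```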
--- flang/lib/Evaluate/check-expression.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp
index 8a90404db0456..a1ede7d7553bf 100644
--- a/flang/lib/Evaluate/check-expression.cpp
+++ b/flang/lib/Evaluate/check-expression.cpp
@@ -525,6 +525,11 @@ class CheckSpecificationExprHelper
   Result operator()(const semantics::Symbol &symbol) const {
     const auto &ultimate{symbol.GetUltimate()};
+    const auto *object{ultimate.detailsIf()};
+    bool isInitialized{semantics::IsSaved(ultimate) &&
+        !IsAllocatable(ultimate) && object &&
+        (ultimate.test(Symbol::Flag::InDataStmt) ||
+            object->init().has_value())};
     if (const auto *assoc{
             ultimate.detailsIf()}) {
       return (*this)(assoc->expr());
@@ -554,8 +559,7 @@ class CheckSpecificationExprHelper
       }
     } else if (&symbol.owner() != &scope_ || &ultimate.owner() != &scope_) {
       return std::nullopt; // host association is in play
-    } else if (semantics::IsSaved(ultimate) &&
-        semantics::IsInitialized(ultimate) &&
+    } else if (isInitialized &&
         context_.languageFeatures().IsEnabled(
             common::LanguageFeature::SavedLocalInSpecExpr)) {
       if (!scope_.IsModuleFile() &&

From 957af7373881e62eec34ca87106fa2a2c2391d8e Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Tue, 10 Sep 2024 15:07:13 -0700
Subject: [PATCH 029/114] [sanitizer] Add CHECKs to validate calculated TLS range (#107941)

---
 compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
index 666e6f3b35106..a1107ff7d2473 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
@@ -151,6 +151,10 @@ DTLS::DTV *DTLS_on_tls_get_addr(void *arg_void, void *res,
     // This may happen inside the DTOR of main thread, so just ignore it.
     tls_size = 0;
   }
+  if (tls_size) {
+    CHECK_LE(tls_beg, reinterpret_cast(res));
+    CHECK_LT(reinterpret_cast(res), tls_beg + tls_size);
+  }
   dtv->beg = tls_beg;
   dtv->size = tls_size;
   return dtv;

From 10c04d9873dbbbca26f4d996396da297b9144add Mon Sep 17 00:00:00 2001
From: Jacob Lalonde
Date: Tue, 10 Sep 2024 15:12:56 -0700
Subject: [PATCH 030/114] [LLDB] Skip Summary Statistics Tests for Windows (#108079)

Follow-up to #102708: the tests are failing on Windows. There is a large
variance in these tests between summary strings and built-in types. I'm
disabling these tests on Windows and will add Windows-specific tests as a
follow-up.
--- lldb/test/API/commands/statistics/basic/TestStats.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/API/commands/statistics/basic/TestStats.py b/lldb/test/API/commands/statistics/basic/TestStats.py index 03ec169344705..a0a9eeb649320 100644 --- a/lldb/test/API/commands/statistics/basic/TestStats.py +++ b/lldb/test/API/commands/statistics/basic/TestStats.py @@ -921,6 +921,7 @@ def test_order_of_options_do_not_matter(self): f"The order of options '{options[0]}' and '{options[1]}' should not matter", ) + @skipIfWindows def test_summary_statistics_providers(self): """ Test summary timing statistics is included in statistics dump when @@ -960,6 +961,7 @@ def test_summary_statistics_providers(self): self.assertIn("'totalTime':", summary_provider_str) self.assertIn("'type': 'python'", summary_provider_str) + @skipIfWindows def test_summary_statistics_providers_vec(self): """ Test summary timing statistics is included in statistics dump when From 6007ad79afeffb1288781b4a7241290386293aff Mon Sep 17 00:00:00 2001 From: "Henrik G. Olsson" Date: Tue, 10 Sep 2024 15:19:52 -0700 Subject: [PATCH 031/114] Revert "[llvm-lit] Process ANSI color codes in test output when formatting" (#108104) Reverts llvm/llvm-project#106776 because of a test failure on Windows. --- llvm/utils/lit/lit/TestRunner.py | 28 ++----------------- .../Inputs/escape-color/color-escaped.txt | 10 ------- .../lit/tests/Inputs/escape-color/color.txt | 6 ---- .../lit/tests/Inputs/escape-color/lit.cfg | 8 ------ llvm/utils/lit/tests/escape-color.py | 4 --- 5 files changed, 2 insertions(+), 54 deletions(-) delete mode 100644 llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt delete mode 100644 llvm/utils/lit/tests/Inputs/escape-color/color.txt delete mode 100644 llvm/utils/lit/tests/Inputs/escape-color/lit.cfg delete mode 100644 llvm/utils/lit/tests/escape-color.py diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index a2c76d41a43e0..19f35fc7e212f 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1017,20 +1017,6 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): return exitCode -def findColor(line, curr_color): - start = line.rfind("\33[") - if start == -1: - return curr_color - end = line.find("m", start+2) - if end == -1: - return curr_color - match = line[start:end+1] - # "\33[0m" means "reset all formatting". Sometimes the 0 is skipped. 
- if match == "\33[m" or match == "\33[0m": - return None - return match - - def formatOutput(title, data, limit=None): if not data.strip(): return "" @@ -1041,18 +1027,8 @@ def formatOutput(title, data, limit=None): msg = "" ndashes = 30 # fmt: off - out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n" - curr_color = None - for line in data.splitlines(): - if curr_color: - out += "\33[0m" - out += "# | " - if curr_color: - out += curr_color - out += line + "\n" - curr_color = findColor(line, curr_color) - if curr_color: - out += "\33[0m" # prevent unterminated formatting from leaking + out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n" + out += f"# | " + "\n# | ".join(data.splitlines()) + "\n" out += f"# `---{msg}{'-' * (ndashes - 4 - len(msg))}\n" # fmt: on return out diff --git a/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt b/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt deleted file mode 100644 index e7a33e380b351..0000000000000 --- a/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt +++ /dev/null @@ -1,10 +0,0 @@ -# .---command stdout------------ -# | # RUN: cat %s -# | red -# | still red(B -# | plain -# | green -# | still green (never terminated) -# `----------------------------- - --- diff --git a/llvm/utils/lit/tests/Inputs/escape-color/color.txt b/llvm/utils/lit/tests/Inputs/escape-color/color.txt deleted file mode 100644 index 15ffc22d134f0..0000000000000 --- a/llvm/utils/lit/tests/Inputs/escape-color/color.txt +++ /dev/null @@ -1,6 +0,0 @@ -# RUN: cat %s -red -still red(B -plain -green -still green (never terminated) diff --git a/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg b/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg deleted file mode 100644 index 36f4eb69d4858..0000000000000 --- a/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg +++ /dev/null @@ -1,8 +0,0 @@ -import lit.formats - -config.name = "escape-color" -config.suffixes = [".txt"] -config.test_format = lit.formats.ShTest() -config.test_source_root = None -config.test_exec_root = None - diff --git a/llvm/utils/lit/tests/escape-color.py b/llvm/utils/lit/tests/escape-color.py deleted file mode 100644 index 1d0b93b004e9d..0000000000000 --- a/llvm/utils/lit/tests/escape-color.py +++ /dev/null @@ -1,4 +0,0 @@ -# cut off the first 9 lines to avoid absolute file paths in the output -# then keep only the next 10 lines to avoid test timing in the output -# RUN: %{lit} %{inputs}/escape-color/color.txt -a | tail -n +10 | head -n 10 > %t -# RUN: diff %{inputs}/escape-color/color-escaped.txt %t From d14a600b1eb650f05fcd56a7b790e30f1f52e751 Mon Sep 17 00:00:00 2001 From: vporpo Date: Tue, 10 Sep 2024 15:28:21 -0700 Subject: [PATCH 032/114] [SandboxIR] Implement BlockAddress (#107940) This patch implements sandboxir::BlockAddress mirroring llvm:BlockAddress. 
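A short usage sketch of the new constant (hypothetical driver code: `F` and `BB` stand for a `sandboxir::Function *` and a `sandboxir::BasicBlock *` already created in a `sandboxir::Context`; the API names come from the diff below):

```cpp
// Create (or re-use) the address-of-block constant, the sandboxir counterpart
// of LLVM IR's blockaddress(@fn, %bb).
auto *Addr = sandboxir::BlockAddress::get(F, BB);
// get(BB) is equivalent when BB is already embedded in its function.
assert(sandboxir::BlockAddress::get(BB) == Addr);
// The constant remembers what it points at.
assert(Addr->getFunction() == F && Addr->getBasicBlock() == BB);
// lookup() is non-creating: it returns null if BB's address was never taken.
sandboxir::BlockAddress *Existing = sandboxir::BlockAddress::lookup(BB);
```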
--- llvm/include/llvm/SandboxIR/SandboxIR.h | 29 +++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 32 ++++++++++++- llvm/unittests/SandboxIR/SandboxIRTest.cpp | 48 +++++++++++++++++++ 4 files changed, 109 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 91d6b58cfee00..2fdbbbd094650 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -123,6 +123,7 @@ class ConstantFP; class ConstantAggregateZero; class ConstantPointerNull; class PoisonValue; +class BlockAddress; class Context; class Function; class Instruction; @@ -323,6 +324,7 @@ class Value { friend class ConstantPointerNull; // For `Val`. friend class UndefValue; // For `Val`. friend class PoisonValue; // For `Val`. + friend class BlockAddress; // For `Val`. /// All values point to the context. Context &Ctx; @@ -1112,6 +1114,33 @@ class PoisonValue final : public UndefValue { #endif }; +class BlockAddress final : public Constant { + BlockAddress(llvm::BlockAddress *C, Context &Ctx) + : Constant(ClassID::BlockAddress, C, Ctx) {} + friend class Context; // For constructor. + +public: + /// Return a BlockAddress for the specified function and basic block. + static BlockAddress *get(Function *F, BasicBlock *BB); + + /// Return a BlockAddress for the specified basic block. The basic + /// block must be embedded into a function. + static BlockAddress *get(BasicBlock *BB); + + /// Lookup an existing \c BlockAddress constant for the given BasicBlock. + /// + /// \returns 0 if \c !BB->hasAddressTaken(), otherwise the \c BlockAddress. + static BlockAddress *lookup(const BasicBlock *BB); + + Function *getFunction() const; + BasicBlock *getBasicBlock() const; + + /// For isa/dyn_cast. + static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::BlockAddress; + } +}; + /// Iterator for `Instruction`s in a `BasicBlock. /// \Returns an sandboxir::Instruction & when derereferenced. 
class BBIterator { diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 459226216703d..c29e8be24ea75 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -34,6 +34,7 @@ DEF_CONST(ConstantAggregateZero, ConstantAggregateZero) DEF_CONST(ConstantPointerNull, ConstantPointerNull) DEF_CONST(UndefValue, UndefValue) DEF_CONST(PoisonValue, PoisonValue) +DEF_CONST(BlockAddress, BlockAddress) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index a4b68bd8ffd7c..18fdcda15a1a9 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2489,6 +2489,32 @@ PoisonValue *PoisonValue::getElementValue(unsigned Idx) const { cast(Val)->getElementValue(Idx))); } +BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) { + auto *LLVMC = llvm::BlockAddress::get(cast(F->Val), + cast(BB->Val)); + return cast(F->getContext().getOrCreateConstant(LLVMC)); +} + +BlockAddress *BlockAddress::get(BasicBlock *BB) { + auto *LLVMC = llvm::BlockAddress::get(cast(BB->Val)); + return cast(BB->getContext().getOrCreateConstant(LLVMC)); +} + +BlockAddress *BlockAddress::lookup(const BasicBlock *BB) { + auto *LLVMC = llvm::BlockAddress::lookup(cast(BB->Val)); + return cast_or_null(BB->getContext().getValue(LLVMC)); +} + +Function *BlockAddress::getFunction() const { + return cast( + Ctx.getValue(cast(Val)->getFunction())); +} + +BasicBlock *BlockAddress::getBasicBlock() const { + return cast( + Ctx.getValue(cast(Val)->getBasicBlock())); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2585,6 +2611,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr( new ConstantFP(cast(C), *this)); return It->second.get(); + case llvm::Value::BlockAddressVal: + It->second = std::unique_ptr( + new BlockAddress(cast(C), *this)); + return It->second.get(); case llvm::Value::ConstantAggregateZeroVal: { auto *CAZ = cast(C); It->second = std::unique_ptr( @@ -2640,7 +2670,7 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { return It->second.get(); } if (auto *BB = dyn_cast(LLVMV)) { - assert(isa(U) && + assert(isa(U) && "This won't create a SBBB, don't call this function directly!"); if (auto *SBBB = getValue(BB)) return SBBB; diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 1b939b4d047aa..b76d24dc297b9 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -729,6 +729,54 @@ define void @foo() { EXPECT_EQ(UndefStruct->getNumElements(), 2u); } +TEST_F(SandboxIRTest, BlockAddress) { + parseIR(C, R"IR( +define void @foo(ptr %ptr) { +bb0: + store ptr blockaddress(@foo, %bb0), ptr %ptr + ret void +bb1: + ret void +bb2: + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB0 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "bb0"))); + auto *BB1 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "bb1"))); + auto *BB2 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "bb2"))); + auto It = BB0->begin(); + auto *SI = cast(&*It++); + [[maybe_unused]] auto *Ret = cast(&*It++); + + // Check classof(), creation, getFunction(), getBasicBlock(). 
+ auto *BB0Addr = cast(SI->getValueOperand()); + EXPECT_EQ(BB0Addr->getBasicBlock(), BB0); + EXPECT_EQ(BB0Addr->getFunction(), &F); + // Check get(F, BB). + auto *NewBB0Addr = sandboxir::BlockAddress::get(&F, BB0); + EXPECT_EQ(NewBB0Addr, BB0Addr); + // Check get(BB). + auto *NewBB0Addr2 = sandboxir::BlockAddress::get(BB0); + EXPECT_EQ(NewBB0Addr2, BB0Addr); + auto *BB1Addr = sandboxir::BlockAddress::get(BB1); + EXPECT_EQ(BB1Addr->getBasicBlock(), BB1); + EXPECT_NE(BB1Addr, BB0Addr); + // Check lookup(). + auto *LookupBB0Addr = sandboxir::BlockAddress::lookup(BB0); + EXPECT_EQ(LookupBB0Addr, BB0Addr); + auto *LookupBB1Addr = sandboxir::BlockAddress::lookup(BB1); + EXPECT_EQ(LookupBB1Addr, BB1Addr); + auto *LookupBB2Addr = sandboxir::BlockAddress::lookup(BB2); + EXPECT_EQ(LookupBB2Addr, nullptr); +} + TEST_F(SandboxIRTest, Use) { parseIR(C, R"IR( define i32 @foo(i32 %v0, i32 %v1) { From bb7286515c0b285382f370232f97ffa7cfcbc550 Mon Sep 17 00:00:00 2001 From: Sterling-Augustine <56981066+Sterling-Augustine@users.noreply.github.com> Date: Tue, 10 Sep 2024 22:44:30 +0000 Subject: [PATCH 033/114] [SandboxIR] Implement FixedVectorType (#107930) --- llvm/include/llvm/SandboxIR/Type.h | 46 ++++++++++++++++++++ llvm/lib/SandboxIR/Type.cpp | 5 +++ llvm/unittests/SandboxIR/TypesTest.cpp | 58 ++++++++++++++++++++++++++ 3 files changed, 109 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 44aee4e4a5b46..ec141c249fb21 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -25,6 +25,7 @@ class Context; // Forward declare friend classes for MSVC. class PointerType; class VectorType; +class FixedVectorType; class IntegerType; class FunctionType; class ArrayType; @@ -41,6 +42,7 @@ class Type { friend class ArrayType; // For LLVMTy. friend class StructType; // For LLVMTy. friend class VectorType; // For LLVMTy. + friend class FixedVectorType; // For LLVMTy. friend class PointerType; // For LLVMTy. friend class FunctionType; // For LLVMTy. friend class IntegerType; // For LLVMTy. 
@@ -344,6 +346,50 @@ class VectorType : public Type { } }; +class FixedVectorType : public VectorType { +public: + static FixedVectorType *get(Type *ElementType, unsigned NumElts); + + static FixedVectorType *get(Type *ElementType, const FixedVectorType *FVTy) { + return get(ElementType, FVTy->getNumElements()); + } + + static FixedVectorType *getInteger(FixedVectorType *VTy) { + return cast(VectorType::getInteger(VTy)); + } + + static FixedVectorType *getExtendedElementVectorType(FixedVectorType *VTy) { + return cast(VectorType::getExtendedElementVectorType(VTy)); + } + + static FixedVectorType *getTruncatedElementVectorType(FixedVectorType *VTy) { + return cast( + VectorType::getTruncatedElementVectorType(VTy)); + } + + static FixedVectorType *getSubdividedVectorType(FixedVectorType *VTy, + int NumSubdivs) { + return cast( + VectorType::getSubdividedVectorType(VTy, NumSubdivs)); + } + + static FixedVectorType *getHalfElementsVectorType(FixedVectorType *VTy) { + return cast(VectorType::getHalfElementsVectorType(VTy)); + } + + static FixedVectorType *getDoubleElementsVectorType(FixedVectorType *VTy) { + return cast(VectorType::getDoubleElementsVectorType(VTy)); + } + + static bool classof(const Type *T) { + return isa(T->LLVMTy); + } + + unsigned getNumElements() const { + return cast(LLVMTy)->getNumElements(); + } +}; + class FunctionType : public Type { public: // TODO: add missing functions diff --git a/llvm/lib/SandboxIR/Type.cpp b/llvm/lib/SandboxIR/Type.cpp index bf9f02e2ba311..26aa8b3743084 100644 --- a/llvm/lib/SandboxIR/Type.cpp +++ b/llvm/lib/SandboxIR/Type.cpp @@ -103,6 +103,11 @@ bool VectorType::isValidElementType(Type *ElemTy) { return llvm::VectorType::isValidElementType(ElemTy->LLVMTy); } +FixedVectorType *FixedVectorType::get(Type *ElementType, unsigned NumElts) { + return cast(ElementType->getContext().getType( + llvm::FixedVectorType::get(ElementType->LLVMTy, NumElts))); +} + IntegerType *IntegerType::get(Context &Ctx, unsigned NumBits) { return cast( Ctx.getType(llvm::IntegerType::get(Ctx.LLVMCtx, NumBits))); diff --git a/llvm/unittests/SandboxIR/TypesTest.cpp b/llvm/unittests/SandboxIR/TypesTest.cpp index e4f9235c1ef3c..3564ae6683014 100644 --- a/llvm/unittests/SandboxIR/TypesTest.cpp +++ b/llvm/unittests/SandboxIR/TypesTest.cpp @@ -323,6 +323,64 @@ define void @foo(<4 x i16> %vi0, <4 x float> %vf1, i8 %i0) { EXPECT_FALSE(sandboxir::VectorType::isValidElementType(FVecTy)); } +TEST_F(SandboxTypeTest, FixedVectorType) { + parseIR(C, R"IR( +define void @foo(<4 x i16> %vi0, <4 x float> %vf1, i8 %i0) { + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + // Check classof(), creation, accessors + auto *Vec4i16Ty = cast(F->getArg(0)->getType()); + EXPECT_TRUE(Vec4i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec4i16Ty->getElementCount(), ElementCount::getFixed(4)); + + // get(ElementType, NumElements) + EXPECT_EQ( + sandboxir::FixedVectorType::get(sandboxir::Type::getInt16Ty(Ctx), 4), + F->getArg(0)->getType()); + // get(ElementType, Other) + EXPECT_EQ(sandboxir::FixedVectorType::get( + sandboxir::Type::getInt16Ty(Ctx), + cast(F->getArg(0)->getType())), + F->getArg(0)->getType()); + auto *Vec4FTy = cast(F->getArg(1)->getType()); + EXPECT_TRUE(Vec4FTy->getElementType()->isFloatTy()); + // getInteger + auto *Vec4i32Ty = sandboxir::FixedVectorType::getInteger(Vec4FTy); + EXPECT_TRUE(Vec4i32Ty->getElementType()->isIntegerTy(32)); + EXPECT_EQ(Vec4i32Ty->getElementCount(), 
Vec4FTy->getElementCount()); + // getExtendedElementCountVectorType + auto *Vec4i64Ty = + sandboxir::FixedVectorType::getExtendedElementVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec4i64Ty->getElementType()->isIntegerTy(32)); + EXPECT_EQ(Vec4i64Ty->getElementCount(), Vec4i16Ty->getElementCount()); + // getTruncatedElementVectorType + auto *Vec4i8Ty = + sandboxir::FixedVectorType::getTruncatedElementVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec4i8Ty->getElementType()->isIntegerTy(8)); + EXPECT_EQ(Vec4i8Ty->getElementCount(), Vec4i8Ty->getElementCount()); + // getSubdividedVectorType + auto *Vec8i8Ty = + sandboxir::FixedVectorType::getSubdividedVectorType(Vec4i16Ty, 1); + EXPECT_TRUE(Vec8i8Ty->getElementType()->isIntegerTy(8)); + EXPECT_EQ(Vec8i8Ty->getElementCount(), ElementCount::getFixed(8)); + // getNumElements + EXPECT_EQ(Vec8i8Ty->getNumElements(), 8u); + // getHalfElementsVectorType + auto *Vec2i16Ty = + sandboxir::FixedVectorType::getHalfElementsVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec2i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec2i16Ty->getElementCount(), ElementCount::getFixed(2)); + // getDoubleElementsVectorType + auto *Vec8i16Ty = + sandboxir::FixedVectorType::getDoubleElementsVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec8i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec8i16Ty->getElementCount(), ElementCount::getFixed(8)); +} + TEST_F(SandboxTypeTest, FunctionType) { parseIR(C, R"IR( define void @foo() { From 5804193e38680683b370cb3ced46c018d4dbd1b2 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Tue, 10 Sep 2024 16:14:07 -0700 Subject: [PATCH 034/114] Revert "[sanitizer] Add CHECKs to validate calculated TLS range" (#108112) Reverts llvm/llvm-project#107941 Broke PPC bot --- compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp index a1107ff7d2473..666e6f3b35106 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp @@ -151,10 +151,6 @@ DTLS::DTV *DTLS_on_tls_get_addr(void *arg_void, void *res, // This may happen inside the DTOR of main thread, so just ignore it. tls_size = 0; } - if (tls_size) { - CHECK_LE(tls_beg, reinterpret_cast(res)); - CHECK_LT(reinterpret_cast(res), tls_beg + tls_size); - } dtv->beg = tls_beg; dtv->size = tls_size; return dtv; From 829ea59ddaf0ddfa1d9316a9260bd3ba17562ffe Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Tue, 10 Sep 2024 16:21:01 -0700 Subject: [PATCH 035/114] [docs] Add a section on AI-generated content to the developer policy (#91014) Governments around the world are starting to require labelling for AI-generated content, and some LLVM stakeholders have asked if LLVM contains AI-generated content. Defining a policy on the use of AI tools allows us to answer that question affirmatively, one way of the other. The policy proposed here allows the use of AI tools in LLVM contributions, flowing from the idea that any contribution is fine regardless of how it is made, as long as the contributor has the right to license it under the project license. 
I gathered input from the community in this RFC and incorporated it into the policy: https://discourse.llvm.org/t/rfc-define-policy-on-ai-tool-usage-in-contributions/78758 --- llvm/docs/DeveloperPolicy.rst | 28 +++++++++++++++++++++++++++- llvm/docs/FAQ.rst | 7 +++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst index 49ec310b382f9..f74adc4702d38 100644 --- a/llvm/docs/DeveloperPolicy.rst +++ b/llvm/docs/DeveloperPolicy.rst @@ -1077,6 +1077,8 @@ If you have questions or comments about these topics, please ask on the please realize that most compiler developers are not lawyers, and therefore you will not be getting official legal advice. +.. _LLVM Discourse forums: https://discourse.llvm.org + Copyright --------- @@ -1301,4 +1303,28 @@ to move code from (e.g.) libc++ to the LLVM core without concern, but that code cannot be moved from the LLVM core to libc++ without the copyright owner's permission. -.. _LLVM Discourse forums: https://discourse.llvm.org +.. _ai contributions: + +AI generated contributions +-------------------------- + +Artificial intelligence systems raise many questions around copyright that have +yet to be answered. Our policy on AI tools is guided by our copyright policy: +Contributors are responsible for ensuring that they have the right to contribute +code under the terms of our license, typically meaning that either they, their +employer, or their collaborators hold the copyright. Using AI tools to +regenerate copyrighted material does not remove the copyright, and contributors +are responsible for ensuring that such material does not appear in their +contributions. + +As such, the LLVM policy is that contributors are permitted to use artificial +intelligence tools to produce contributions, provided that they have the right +to license that code under the project license. Contributions found to violate +this policy will be removed just like any other offending contribution. + +While the LLVM project has a liberal policy on AI tool use, contributors are +considered responsible for their contributions. We encourage contributors to +review all generated code before sending it for review to verify its +correctness and to understand it so that they can answer questions during code +review. Reviewing and maintaining generated code that the original contributor +does not understand is not a good use of limited project resources. diff --git a/llvm/docs/FAQ.rst b/llvm/docs/FAQ.rst index 229ac99f703c1..aa20de47a6998 100644 --- a/llvm/docs/FAQ.rst +++ b/llvm/docs/FAQ.rst @@ -22,6 +22,13 @@ Yes. This is why we distribute LLVM under a less restrictive license than GPL, as explained in the first question above. +Can I use AI coding tools, such as GitHub co-pilot, to write LLVM patches? +-------------------------------------------------------------------------- +Yes, as long as the resulting work can be licensed under the project license, as +covered in the :doc:`DeveloperPolicy`. Using an AI tool to reproduce copyrighted +work does not rinse it of copyright and grant you the right to relicense it. + + Source Code =========== From ae5f1a78d3a930466f927989faac8e0b9d820a7b Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Tue, 10 Sep 2024 16:27:56 -0700 Subject: [PATCH 036/114] [MemProf] Convert CallContextInfo to a struct (NFC) (#108086) As suggested in #107918, improve readability by converting this tuple to a struct. 
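To make the readability gain concrete, a before/after fragment (illustrative only; field names match the struct in the diff below):

```cpp
{ // Before: CallContextInfo was a std::tuple, so use sites carried
  // positional indices.
  auto &Ids = std::get<1>(Calls[I]); // stack ids -- which slot was 1 again?
  auto *Func = std::get<2>(Calls[I]);
}
{ // After: the struct names each member at the use site.
  auto &Ids = Calls[I].StackIds;
  auto *Func = Calls[I].Func;
}
```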
--- .../IPO/MemProfContextDisambiguation.cpp | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 5c09bb1800cb2..fa25baee2ba03 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -470,8 +470,20 @@ class CallsiteContextGraph { private: using EdgeIter = typename std::vector>::iterator; - using CallContextInfo = std::tuple, - const FuncTy *, DenseSet>; + // Structure to keep track of information for each call as we are matching + // non-allocation callsites onto context nodes created from the allocation + // call metadata / summary contexts. + struct CallContextInfo { + // The callsite we're trying to match. + CallTy Call; + // The callsites stack ids that have a context node in the graph. + std::vector StackIds; + // The function containing this callsite. + const FuncTy *Func; + // Initially empty, if needed this will be updated to contain the context + // ids for use in a new context node created for this callsite. + DenseSet ContextIds; + }; /// Assigns the given Node to calls at or inlined into the location with /// the Node's stack id, after post order traversing and processing its @@ -1458,7 +1470,7 @@ void CallsiteContextGraph::updateStackNodes() { auto &Calls = It.getSecond(); // Skip single calls with a single stack id. These don't need a new node. if (Calls.size() == 1) { - auto &Ids = std::get<1>(Calls[0]); + auto &Ids = Calls[0].StackIds; if (Ids.size() == 1) continue; } @@ -1474,18 +1486,15 @@ void CallsiteContextGraph::updateStackNodes() { // that to sort by. DenseMap FuncToIndex; for (const auto &[Idx, CallCtxInfo] : enumerate(Calls)) - FuncToIndex.insert({std::get<2>(CallCtxInfo), Idx}); + FuncToIndex.insert({CallCtxInfo.Func, Idx}); std::stable_sort( Calls.begin(), Calls.end(), [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) { - auto &IdsA = std::get<1>(A); - auto &IdsB = std::get<1>(B); - auto *FuncA = std::get<2>(A); - auto *FuncB = std::get<2>(B); - return IdsA.size() > IdsB.size() || - (IdsA.size() == IdsB.size() && - (IdsA < IdsB || - (IdsA == IdsB && FuncToIndex[FuncA] < FuncToIndex[FuncB]))); + return A.StackIds.size() > B.StackIds.size() || + (A.StackIds.size() == B.StackIds.size() && + (A.StackIds < B.StackIds || + (A.StackIds == B.StackIds && + FuncToIndex[A.Func] < FuncToIndex[B.Func]))); }); // Find the node for the last stack id, which should be the same @@ -1520,7 +1529,7 @@ void CallsiteContextGraph::updateStackNodes() { #ifndef NDEBUG // If this call has a different set of ids than the last one, clear the // set used to ensure they are sorted properly. - if (I > 0 && Ids != std::get<1>(Calls[I - 1])) + if (I > 0 && Ids != Calls[I - 1].StackIds) MatchingIdsFuncSet.clear(); else // If the prior call had the same stack ids this set would not be empty. @@ -1607,17 +1616,18 @@ void CallsiteContextGraph::updateStackNodes() { // assigned to the same context node, and skip them. bool DuplicateContextIds = false; for (unsigned J = I + 1; J < Calls.size(); J++) { - auto &NextIds = std::get<1>(Calls[J]); + auto &CallCtxInfo = Calls[J]; + auto &NextIds = CallCtxInfo.StackIds; if (NextIds != Ids) break; - auto *NextFunc = std::get<2>(Calls[J]); + auto *NextFunc = CallCtxInfo.Func; if (NextFunc != Func) { // We have another Call with the same ids but that cannot share this // node, must duplicate ids for it. 
DuplicateContextIds = true;
         break;
       }
-      auto &NextCall = std::get<0>(Calls[J]);
+      auto &NextCall = CallCtxInfo.Call;
       CallToMatchingCall[NextCall] = Call;
       // Update I so that it gets incremented correctly to skip this call.
       I = J;

From d2f25e5405cce348913994db71a5efb0c1cf7f28 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 10 Sep 2024 16:38:26 -0700
Subject: [PATCH 037/114] [LegalizeTypes] Avoid creating an unused node in ExpandIntRes_ADDSUB. NFC

The Hi result is sometimes calculated a different way and this node goes
unused. Defer creation until we know for sure it is needed.

The test changes are because the node creation order changed the names in the
debug output.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp   | 7 ++++---
 .../update_llc_test_checks/Inputs/lanai_isel.ll.expected | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a1cb74f43e605..2fa9e46eae506 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3496,7 +3496,6 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
   if (N->getOpcode() == ISD::ADD) {
     Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps);
-    Hi = DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2));
     SDValue Cmp;
     // Special case: X+1 has a carry out if X+1==0. This may reduce the live
     // range of X. We assume comparing with 0 is cheap.
@@ -3521,10 +3520,12 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
     Carry = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT),
                           DAG.getConstant(0, dl, NVT));
-    if (isAllOnesConstant(LoOps[1]) && isAllOnesConstant(HiOps[1]))
+    if (isAllOnesConstant(LoOps[1]) && isAllOnesConstant(HiOps[1])) {
       Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps[0], Carry);
-    else
+    } else {
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2));
       Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry);
+    }
   } else {
     Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps);
     Hi = DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2));
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected
index 71e82eca6c3e3..936efa378c1a4 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected
@@ -12,10 +12,10 @@ define i64 @i64_test(i64 %i) nounwind readnone {
 ; CHECK-NEXT: t24: i32 = ADD_R t5, t22, TargetConstant:i32<0>
 ; CHECK-NEXT: t3: i32,ch = LDW_RI TargetFrameIndex:i32<-1>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0
 ; CHECK-NEXT: t19: i32,ch = LDW_RI TargetFrameIndex:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0
-; CHECK-NEXT: t25: i32 = ADD_R t3, t19, TargetConstant:i32<0>
+; CHECK-NEXT: t27: i32 = ADD_R t3, t19, TargetConstant:i32<0>
 ; CHECK-NEXT: t30: i32,glue = SFSUB_F_RR t24, t5
 ; CHECK-NEXT: t31: i32 = SCC TargetConstant:i32<4>, t30:1
-; CHECK-NEXT: t28: i32 = ADD_R t25, t31, TargetConstant:i32<0>
+; CHECK-NEXT: t28: i32 = ADD_R t27, t31, TargetConstant:i32<0>
 ; CHECK-NEXT: t15: ch,glue = CopyToReg t0, Register:i32 $rv, t28
 ; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $r9, t24, t15:1
 ; CHECK-NEXT: t18: ch = RET Register:i32 $rv, Register:i32 $r9, t17, t17:1

From db7e8f2ae81fe10170dc202e45ee8b784e75c74c Mon Sep 17 00:00:00 2001
From: Arthur Eubanks
Date: Tue, 10 Sep
2024 17:02:15 -0700 Subject: [PATCH 038/114] [compiler-rt] Hardcode uptr/sptr typedefs on Linux Arm (#108105) After #106155, Android arm32 asan builds stopped working with missing definition linker errors. This is due to inconsistent definitions of `uptr` of either `unsigned long` or `unsigned int` even between TUs in compiler-rt. This is caused by Linux arm32 headers redefining `__UINTPTR_TYPE__` (see `arch/arm/include/uapi/asm/types.h` in the Linux kernel repo), meaning include order/whether or not the Linux header is included changes compiler-rt symbol mangling. As a workaround, this hardcodes `uptr`/`sptr` in compiler-rt to `unsigned int`/`int` on Linux arm32, matching clang/gcc. --- compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index f8f03454ea169..9208b12552ff5 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -139,8 +139,14 @@ namespace __sanitizer { #if defined(__UINTPTR_TYPE__) +# if defined(__arm__) && defined(__linux__) +// Linux Arm headers redefine __UINTPTR_TYPE__ and disagree with clang/gcc. +typedef unsigned int uptr; +typedef int sptr; +# else typedef __UINTPTR_TYPE__ uptr; typedef __INTPTR_TYPE__ sptr; +# endif #elif defined(_WIN64) // 64-bit Windows uses LLP64 data model. typedef unsigned long long uptr; From 6e854a6a01d310689a8b5d50126decd46b3880ea Mon Sep 17 00:00:00 2001 From: ChiaHungDuan Date: Tue, 10 Sep 2024 17:46:02 -0700 Subject: [PATCH 039/114] [scudo] Fix the logic of MaxAllowedFragmentedPages (#107927) MTE doesn't support MaxReleasedCachePages which may break the assumption that only the first 4 pages will have memory tagged. --- compiler-rt/lib/scudo/standalone/secondary.h | 25 +++++++++++++------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index 1a232b9b9fb2d..c79ec1360b00a 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -72,13 +72,16 @@ namespace { struct CachedBlock { static constexpr u16 CacheIndexMax = UINT16_MAX; static constexpr u16 InvalidEntry = CacheIndexMax; - // * MaxReleasedCachePages default is currently 4 - // - We arrived at this value after noticing that mapping - // in larger memory regions performs better than releasing - // memory and forcing a cache hit. According to the data, - // it suggests that beyond 4 pages, the release execution time is - // longer than the map execution time. In this way, the default - // is dependent on the platform. + // We allow a certain amount of fragmentation and part of the fragmented bytes + // will be released by `releaseAndZeroPagesToOS()`. This increases the chance + // of cache hit rate and reduces the overhead to the RSS at the same time. See + // more details in the `MapAllocatorCache::retrieve()` section. + // + // We arrived at this default value after noticing that mapping in larger + // memory regions performs better than releasing memory and forcing a cache + // hit. According to the data, it suggests that beyond 4 pages, the release + // execution time is longer than the map execution time. In this way, + // the default is dependent on the platform. 
static constexpr uptr MaxReleasedCachePages = 4U; uptr CommitBase = 0; @@ -725,8 +728,14 @@ MapAllocator::tryAllocateFromCache(const Options &Options, uptr Size, uptr EntryHeaderPos; uptr MaxAllowedFragmentedPages = MaxUnreleasedCachePages; - if (UNLIKELY(useMemoryTagging(Options))) + if (LIKELY(!useMemoryTagging(Options))) { MaxAllowedFragmentedPages += CachedBlock::MaxReleasedCachePages; + } else { + // TODO: Enable MaxReleasedCachePages may result in pages for an entry being + // partially released and it erases the tag of those pages as well. To + // support this feature for MTE, we need to tag those pages again. + DCHECK_EQ(CachedBlock::MaxReleasedCachePages, 0U); + } Entry = Cache.retrieve(MaxAllowedFragmentedPages, Size, Alignment, getHeadersSize(), EntryHeaderPos); From 68f31aaae95f9824e58001c7f9115034df51039e Mon Sep 17 00:00:00 2001 From: SahilPatidar Date: Wed, 11 Sep 2024 06:20:01 +0530 Subject: [PATCH 040/114] [ORC][Runtime] Add `dlupdate` for MachO (#97441) With the help of @lhames, This pull request introduces the `dlupdate` function in the ORC runtime. `dlupdate` enables incremental execution of new initializers introduced in the REPL environment. Unlike traditional `dlopen`, which manages initializers, code mapping, and library reference counts, `dlupdate` focuses exclusively on running new initializers. --- compiler-rt/lib/orc/dlfcn_wrapper.cpp | 13 +++ compiler-rt/lib/orc/macho_platform.cpp | 86 +++++++++++++++++++ compiler-rt/lib/orc/macho_platform.h | 1 + llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h | 2 + llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 27 +++++- .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 1 + 6 files changed, 128 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/orc/dlfcn_wrapper.cpp b/compiler-rt/lib/orc/dlfcn_wrapper.cpp index fd9dce40d6738..bbbc79f607f27 100644 --- a/compiler-rt/lib/orc/dlfcn_wrapper.cpp +++ b/compiler-rt/lib/orc/dlfcn_wrapper.cpp @@ -20,6 +20,7 @@ using namespace orc_rt; extern "C" const char *__orc_rt_jit_dlerror(); extern "C" void *__orc_rt_jit_dlopen(const char *path, int mode); +extern "C" int __orc_rt_jit_dlupdate(void *dso_handle, int mode); extern "C" int __orc_rt_jit_dlclose(void *dso_handle); ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult @@ -41,6 +42,18 @@ __orc_rt_jit_dlopen_wrapper(const char *ArgData, size_t ArgSize) { .release(); } +#ifdef __APPLE__ +ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult +__orc_rt_jit_dlupdate_wrapper(const char *ArgData, size_t ArgSize) { + return WrapperFunction::handle( + ArgData, ArgSize, + [](ExecutorAddr &DSOHandle, int32_t mode) { + return __orc_rt_jit_dlupdate(DSOHandle.toPtr(), mode); + }) + .release(); +} +#endif + ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult __orc_rt_jit_dlclose_wrapper(const char *ArgData, size_t ArgSize) { return WrapperFunction::handle( diff --git a/compiler-rt/lib/orc/macho_platform.cpp b/compiler-rt/lib/orc/macho_platform.cpp index c092545b2a367..8cc3594b5d0cf 100644 --- a/compiler-rt/lib/orc/macho_platform.cpp +++ b/compiler-rt/lib/orc/macho_platform.cpp @@ -331,6 +331,7 @@ class MachOPlatformRuntimeState { const char *dlerror(); void *dlopen(std::string_view Name, int Mode); + int dlupdate(void *DSOHandle, int Mode); int dlclose(void *DSOHandle); void *dlsym(void *DSOHandle, const char *Symbol); @@ -380,6 +381,12 @@ class MachOPlatformRuntimeState { Error dlopenInitialize(std::unique_lock &JDStatesLock, JITDylibState &JDS, MachOJITDylibDepInfoMap &DepInfo); + Error dlupdateImpl(void *DSOHandle, int Mode); + Error dlupdateFull(std::unique_lock 
&JDStatesLock, + JITDylibState &JDS); + Error dlupdateInitialize(std::unique_lock &JDStatesLock, + JITDylibState &JDS); + Error dlcloseImpl(void *DSOHandle); Error dlcloseDeinitialize(std::unique_lock &JDStatesLock, JITDylibState &JDS); @@ -789,6 +796,20 @@ void *MachOPlatformRuntimeState::dlopen(std::string_view Path, int Mode) { } } +int MachOPlatformRuntimeState::dlupdate(void *DSOHandle, int Mode) { + ORC_RT_DEBUG({ + std::string S; + printdbg("MachOPlatform::dlupdate(%p) (%s)\n", DSOHandle, S.c_str()); + }); + std::lock_guard Lock(DyldAPIMutex); + if (auto Err = dlupdateImpl(DSOHandle, Mode)) { + // FIXME: Make dlerror thread safe. + DLFcnError = toString(std::move(Err)); + return -1; + } + return 0; +} + int MachOPlatformRuntimeState::dlclose(void *DSOHandle) { ORC_RT_DEBUG({ auto *JDS = getJITDylibStateByHeader(DSOHandle); @@ -1244,6 +1265,67 @@ Error MachOPlatformRuntimeState::dlopenInitialize( return Error::success(); } +Error MachOPlatformRuntimeState::dlupdateImpl(void *DSOHandle, int Mode) { + std::unique_lock Lock(JDStatesMutex); + + // Try to find JITDylib state by DSOHandle. + auto *JDS = getJITDylibStateByHeader(DSOHandle); + + if (!JDS) { + std::ostringstream ErrStream; + ErrStream << "No registered JITDylib for " << DSOHandle; + return make_error(ErrStream.str()); + } + + if (!JDS->referenced()) + return make_error("dlupdate failed, JITDylib must be open."); + + if (!JDS->Sealed) { + if (auto Err = dlupdateFull(Lock, *JDS)) + return Err; + } + + return Error::success(); +} + +Error MachOPlatformRuntimeState::dlupdateFull( + std::unique_lock &JDStatesLock, JITDylibState &JDS) { + // Call back to the JIT to push the initializers. + Expected DepInfo((MachOJITDylibDepInfoMap())); + // Unlock so that we can accept the initializer update. + JDStatesLock.unlock(); + if (auto Err = WrapperFunction( + SPSExecutorAddr)>:: + call(JITDispatch(&__orc_rt_macho_push_initializers_tag), DepInfo, + ExecutorAddr::fromPtr(JDS.Header))) + return Err; + JDStatesLock.lock(); + + if (!DepInfo) + return DepInfo.takeError(); + + if (auto Err = dlupdateInitialize(JDStatesLock, JDS)) + return Err; + + return Error::success(); +} + +Error MachOPlatformRuntimeState::dlupdateInitialize( + std::unique_lock &JDStatesLock, JITDylibState &JDS) { + ORC_RT_DEBUG({ + printdbg("MachOPlatformRuntimeState::dlupdateInitialize(\"%s\")\n", + JDS.Name.c_str()); + }); + + // Initialize this JITDylib. + if (auto Err = registerObjCRegistrationObjects(JDStatesLock, JDS)) + return Err; + if (auto Err = runModInits(JDStatesLock, JDS)) + return Err; + + return Error::success(); +} + Error MachOPlatformRuntimeState::dlcloseImpl(void *DSOHandle) { std::unique_lock Lock(JDStatesMutex); @@ -1517,6 +1599,10 @@ void *__orc_rt_macho_jit_dlopen(const char *path, int mode) { return MachOPlatformRuntimeState::get().dlopen(path, mode); } +int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode) { + return MachOPlatformRuntimeState::get().dlupdate(dso_handle, mode); +} + int __orc_rt_macho_jit_dlclose(void *dso_handle) { return MachOPlatformRuntimeState::get().dlclose(dso_handle); } diff --git a/compiler-rt/lib/orc/macho_platform.h b/compiler-rt/lib/orc/macho_platform.h index 62234039437c0..ad70c97809d2f 100644 --- a/compiler-rt/lib/orc/macho_platform.h +++ b/compiler-rt/lib/orc/macho_platform.h @@ -24,6 +24,7 @@ ORC_RT_INTERFACE void __orc_rt_macho_cxa_finalize(void *dso_handle); // dlfcn functions. 
ORC_RT_INTERFACE const char *__orc_rt_macho_jit_dlerror(); ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlopen(const char *path, int mode); +ORC_RT_INTERFACE int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode); ORC_RT_INTERFACE int __orc_rt_macho_jit_dlclose(void *dso_handle); ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlsym(void *dso_handle, const char *symbol); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h b/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h index 3a71ddc88ce95..2660b9f74f405 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h @@ -13,6 +13,7 @@ #ifndef LLVM_EXECUTIONENGINE_ORC_LLJIT_H #define LLVM_EXECUTIONENGINE_ORC_LLJIT_H +#include "llvm/ADT/SmallSet.h" #include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" @@ -620,6 +621,7 @@ class ORCPlatformSupport : public LLJIT::PlatformSupport { private: orc::LLJIT &J; DenseMap DSOHandles; + SmallPtrSet InitializedDylib; }; } // End namespace orc diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 2f9f4d33df017..19b3f3d6ea038 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -602,6 +602,7 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) { using llvm::orc::shared::SPSExecutorAddr; using llvm::orc::shared::SPSString; using SPSDLOpenSig = SPSExecutorAddr(SPSString, int32_t); + using SPSDLUpdateSig = int32_t(SPSExecutorAddr, int32_t); enum dlopen_mode : int32_t { ORC_RT_RTLD_LAZY = 0x1, ORC_RT_RTLD_NOW = 0x2, @@ -612,9 +613,30 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) { auto &ES = J.getExecutionSession(); auto MainSearchOrder = J.getMainJITDylib().withLinkOrderDo( [](const JITDylibSearchOrder &SO) { return SO; }); + StringRef WrapperToCall = "__orc_rt_jit_dlopen_wrapper"; + bool dlupdate = false; + if (ES.getTargetTriple().isOSBinFormatMachO()) { + if (InitializedDylib.contains(&JD)) { + WrapperToCall = "__orc_rt_jit_dlupdate_wrapper"; + dlupdate = true; + } else + InitializedDylib.insert(&JD); + } - if (auto WrapperAddr = ES.lookup( - MainSearchOrder, J.mangleAndIntern("__orc_rt_jit_dlopen_wrapper"))) { + if (auto WrapperAddr = + ES.lookup(MainSearchOrder, J.mangleAndIntern(WrapperToCall))) { + if (dlupdate) { + int32_t result; + auto E = ES.callSPSWrapper(WrapperAddr->getAddress(), + result, DSOHandles[&JD], + int32_t(ORC_RT_RTLD_LAZY)); + if (E) + return E; + else if (result) + return make_error("dlupdate failed", + inconvertibleErrorCode()); + return Error::success(); + } return ES.callSPSWrapper(WrapperAddr->getAddress(), DSOHandles[&JD], JD.getName(), int32_t(ORC_RT_RTLD_LAZY)); @@ -641,6 +663,7 @@ Error ORCPlatformSupport::deinitialize(orc::JITDylib &JD) { return make_error("dlclose failed", inconvertibleErrorCode()); DSOHandles.erase(&JD); + InitializedDylib.erase(&JD); } else return WrapperAddr.takeError(); return Error::success(); diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index a71afe1a3162f..e56d6b47799c0 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -428,6 +428,7 @@ MachOPlatform::standardRuntimeUtilityAliases() { {"___orc_rt_run_program", "___orc_rt_macho_run_program"}, {"___orc_rt_jit_dlerror", "___orc_rt_macho_jit_dlerror"}, {"___orc_rt_jit_dlopen", "___orc_rt_macho_jit_dlopen"}, + 
From 77fc8dae22ff1fa38c0271abc5521db76351f1fd Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 11 Sep 2024 09:13:23 +0800
Subject: [PATCH 041/114] [RISCV] Rematerialize vmv.v.x (#107993)

Even though vmv.v.x has a non-constant scalar operand, we can still
rematerialize it because we have split register allocation between
vectors and scalars.

InlineSpiller will check to make sure that the scalar operand is live at
the point where the rematerialization occurs, so this won't extend any
scalar live ranges. However, this also means we may not be able to
rematerialize in some cases, as shown in @vmv.v.x_needs_extended. It
might be worthwhile teaching InlineSpiller to extend scalar live ranges
in a future patch. I experimented with this locally and it reduced
spills on 531.deepsjeng_r by a further 3%.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |   1 +
 .../Target/RISCV/RISCVInstrInfoVPseudos.td    |   1 +
 llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll       | 157 ++++++--------
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        | 196 ++++++++----------
 llvm/test/CodeGen/RISCV/rvv/remat.ll          | 141 +++++++++++++
 5 files changed, 300 insertions(+), 196 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 325a50c9f48a1..2bb9df4ead0e9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -169,6 +169,7 @@ Register RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
 bool RISCVInstrInfo::isReallyTriviallyReMaterializable(
     const MachineInstr &MI) const {
   switch (RISCV::getRVVMCOpcode(MI.getOpcode())) {
+  case RISCV::VMV_V_X:
   case RISCV::VMV_V_I:
   case RISCV::VID_V:
     if (MI.getOperand(1).isUndef() &&
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index e11f176bfe604..c6cecb7d07182 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -2475,6 +2475,7 @@ multiclass VPseudoUnaryVMV_V_X_I {
     def "_V_" # mx : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
                      SchedUnary<"WriteVIMovV", "ReadVIMovV", mx,
                                 forcePassthruRead=true>;
+    let isReMaterializable = 1 in
     def "_X_" # mx : VPseudoUnaryNoMask<m.vrclass, GPR>,
                      SchedUnary<"WriteVIMovX", "ReadVIMovX", mx,
                                 forcePassthruRead=true>;
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 01aac122d5957..7031f93edc2c3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -2022,14 +2022,9 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale
@vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64_unmasked( %va,
 ; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vsub.vv v24, v16, v24
+; RV32-NEXT: vsub.vv v16, v16, v24
 ; RV32-NEXT: lui a3, 209715
 ; RV32-NEXT: addi a3, a3, 819
 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT: vmv.v.x v0, a3
 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v24, v0
-; RV32-NEXT: vsrl.vi v24, v24, 2
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: vsrl.vi v16, v16, 2
 ; RV32-NEXT: csrr a3, vlenb
 ; RV32-NEXT: slli a3, a3, 4
 ; RV32-NEXT: add a3, sp, a3
;
RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vadd.vv v24, v16, v24 -; RV32-NEXT: vsrl.vi v16, v24, 4 +; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: addi a3, a3, -241 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma @@ -2437,16 +2412,16 @@ define @vp_ctpop_nxv16i64_unmasked( %va, ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vsub.vv v24, v8, v24 +; RV32-NEXT: vsub.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v0 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v24, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll index 0ef0a431dabc4..d36240e493e41 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll @@ -2266,7 +2266,7 @@ define @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 5 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: sub a2, a0, a1 ; RV32-NEXT: sltu a3, a0, a2 @@ -2624,22 +2615,22 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: and a3, a3, a2 ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a2 +; RV32-NEXT: vsub.vx v24, v16, a2 ; RV32-NEXT: vnot.v v16, v16 -; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsrl.vi v24, v16, 1 ; RV32-NEXT: lui a4, 349525 ; RV32-NEXT: addi a4, a4, 1365 ; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v0, a4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 24 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v24, v8 +; RV32-NEXT: vand.vv v24, v24, v0 ; RV32-NEXT: vsub.vv v16, v16, v24 ; RV32-NEXT: lui a4, 209715 ; RV32-NEXT: addi a4, a4, 819 @@ -2648,6 +2639,11 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; 
RV32-NEXT: vand.vv v24, v16, v0 ; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vadd.vv v16, v24, v16 ; RV32-NEXT: vsrl.vi v24, v16, 4 @@ -2655,50 +2651,46 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: lui a4, 61681 ; RV32-NEXT: addi a4, a4, -241 ; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a4, a4, 257 ; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vmv.v.x v24, a4 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vmul.vv v16, v16, v24 ; RV32-NEXT: li a3, 56 -; RV32-NEXT: vsrl.vx v8, v8, a3 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v16, v16, a3 ; RV32-NEXT: bltu a0, a1, .LBB47_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: .LBB47_2: -; RV32-NEXT: slli a1, a1, 5 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v24, a2 -; RV32-NEXT: vnot.v v24, v24 -; RV32-NEXT: vand.vv v8, v24, v8 +; RV32-NEXT: vsub.vx v24, v8, a2 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v24, v0 ; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v24, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v0 @@ -2706,23 +2698,17 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: vsrl.vi v24, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: vsrl.vx v8, v8, a3 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v 
v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vmul.vv v8, v8, v24
+; RV32-NEXT: vsrl.vx v8, v8, a3
 ; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
 ; RV32-NEXT: add sp, sp, a0
 ; RV32-NEXT: addi sp, sp, 16
 ; RV32-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll
index 2b12249378eb1..514612cd0525d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/remat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll
@@ -171,3 +171,144 @@ define void @vmv.v.i(ptr %p) {
   store volatile <vscale x 8 x i64> %vmv.v.i, ptr %p
   ret void
 }
+
+; The live range of %x needs extended down to the use of vmv.v.x at the end of
+; the block.
+define void @vmv.v.x_needs_extended(ptr %p, i64 %x) {
+; POSTRA-LABEL: vmv.v.x_needs_extended:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: addi sp, sp, -16
+; POSTRA-NEXT: .cfi_def_cfa_offset 16
+; POSTRA-NEXT: csrr a2, vlenb
+; POSTRA-NEXT: slli a2, a2, 3
+; POSTRA-NEXT: sub sp, sp, a2
+; POSTRA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; POSTRA-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; POSTRA-NEXT: vmv.v.x v8, a1
+; POSTRA-NEXT: addi a1, sp, 16
+; POSTRA-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vl8re64.v v16, (a0)
+; POSTRA-NEXT: vl8re64.v v24, (a0)
+; POSTRA-NEXT: vl8re64.v v0, (a0)
+; POSTRA-NEXT: vl8re64.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v0, (a0)
+; POSTRA-NEXT: vs8r.v v24, (a0)
+; POSTRA-NEXT: vs8r.v v16, (a0)
+; POSTRA-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: csrr a0, vlenb
+; POSTRA-NEXT: slli a0, a0, 3
+; POSTRA-NEXT: add sp, sp, a0
+; POSTRA-NEXT: addi sp, sp, 16
+; POSTRA-NEXT: ret
+;
+; PRERA-LABEL: vmv.v.x_needs_extended:
+; PRERA: # %bb.0:
+; PRERA-NEXT: addi sp, sp, -16
+; PRERA-NEXT: .cfi_def_cfa_offset 16
+; PRERA-NEXT: csrr a2, vlenb
+; PRERA-NEXT: slli a2, a2, 3
+; PRERA-NEXT: sub sp, sp, a2
+; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; PRERA-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; PRERA-NEXT: vmv.v.x v8, a1
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: addi a1, sp, 16
+; PRERA-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; PRERA-NEXT: vl8re64.v v24, (a0)
+; PRERA-NEXT: vl8re64.v v0, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v0, (a0)
+; PRERA-NEXT: vs8r.v v24, (a0)
+; PRERA-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: csrr a0, vlenb
+; PRERA-NEXT: slli a0, a0, 3
+; PRERA-NEXT: add sp, sp, a0
+; PRERA-NEXT: addi sp, sp, 16
+; PRERA-NEXT: ret
+  %vmv.v.x = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(<vscale x 8 x i64> poison, i64 %x, i64 -1)
+  store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
+
+  %a = load volatile <vscale x 8 x i64>, ptr %p
+  %b = load volatile <vscale x 8 x i64>, ptr %p
+  %c = load volatile <vscale x 8 x i64>, ptr %p
+  %d = load volatile <vscale x 8 x i64>, ptr %p
+  store volatile <vscale x 8 x i64> %d, ptr %p
+  store volatile <vscale x 8 x i64> %c, ptr %p
+  store volatile <vscale x 8 x i64> %b, ptr %p
+  store volatile <vscale x 8 x i64> %a, ptr %p
+
+  store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
+  ret void
+}
+
+define void @vmv.v.x_live(ptr %p, i64 %x) {
+; POSTRA-LABEL: vmv.v.x_live:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; POSTRA-NEXT: vmv.v.x v8, a1
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vl8re64.v v16, (a0)
+; POSTRA-NEXT: vl8re64.v v24, (a0)
+; POSTRA-NEXT: vl8re64.v v0, (a0)
+; POSTRA-NEXT: vl8re64.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v0, (a0)
+; POSTRA-NEXT: vs8r.v v24, (a0)
+; POSTRA-NEXT: vs8r.v v16, (a0)
+; POSTRA-NEXT: vmv.v.x v8, a1
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: sd a1, 0(a0)
+; POSTRA-NEXT: ret
+;
+; PRERA-LABEL: vmv.v.x_live:
+; PRERA: # %bb.0:
+; PRERA-NEXT: addi sp, sp, -16
+; PRERA-NEXT: .cfi_def_cfa_offset 16
+; PRERA-NEXT: csrr a2, vlenb
+; PRERA-NEXT: slli a2, a2, 3
+; PRERA-NEXT: sub sp, sp, a2
+; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; PRERA-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; PRERA-NEXT: vmv.v.x v8, a1
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: addi a2, sp, 16
+; PRERA-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; PRERA-NEXT: vl8re64.v v24, (a0)
+; PRERA-NEXT: vl8re64.v v0, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v0, (a0)
+; PRERA-NEXT: vs8r.v v24, (a0)
+; PRERA-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: sd a1, 0(a0)
+; PRERA-NEXT: csrr a0, vlenb
+; PRERA-NEXT: slli a0, a0, 3
+; PRERA-NEXT: add sp, sp, a0
+; PRERA-NEXT: addi sp, sp, 16
+; PRERA-NEXT: ret
+  %vmv.v.x = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(<vscale x 8 x i64> poison, i64 %x, i64 -1)
+  store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
+
+  %a = load volatile <vscale x 8 x i64>, ptr %p
+  %b = load volatile <vscale x 8 x i64>, ptr %p
+  %c = load volatile <vscale x 8 x i64>, ptr %p
+  %d = load volatile <vscale x 8 x i64>, ptr %p
+  store volatile <vscale x 8 x i64> %d, ptr %p
+  store volatile <vscale x 8 x i64> %c, ptr %p
+  store volatile <vscale x 8 x i64> %b, ptr %p
+  store volatile <vscale x 8 x i64> %a, ptr %p
+
+  store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
+  store volatile i64 %x, ptr %p
+  ret void
+}
From 21a0176c584c47218f20322641af8a855b8ce5e2 Mon Sep 17 00:00:00 2001
From: Jim Lin
Date: Wed, 11 Sep 2024 09:17:05 +0800
Subject: [PATCH 042/114] [RISCV] Add testcase for -mcmodel= (#107816)

This is a pre-commit test for #107817.
---
 clang/test/Driver/riscv-mcmodel.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 clang/test/Driver/riscv-mcmodel.c

diff --git a/clang/test/Driver/riscv-mcmodel.c b/clang/test/Driver/riscv-mcmodel.c
new file mode 100644
index 0000000000000..4f5fa95f59b66
--- /dev/null
+++ b/clang/test/Driver/riscv-mcmodel.c
@@ -0,0 +1,14 @@
+// RUN: %clang --target=riscv32 -### -c -mcmodel=small %s 2>&1 | FileCheck --check-prefix=SMALL %s
+// RUN: %clang --target=riscv64 -### -c -mcmodel=small %s 2>&1 | FileCheck --check-prefix=SMALL %s
+
+// RUN: %clang --target=riscv32 -### -c -mcmodel=medlow %s 2>&1 | FileCheck --check-prefix=SMALL %s
+// RUN: %clang --target=riscv64 -### -c -mcmodel=medlow %s 2>&1 | FileCheck --check-prefix=SMALL %s
+
+// RUN: %clang --target=riscv32 -### -c -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
+// RUN: %clang --target=riscv64 -### -c -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
+
+// RUN: %clang --target=riscv32 -### -c -mcmodel=medany %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
+// RUN: %clang --target=riscv64 -### -c -mcmodel=medany %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
+
+// SMALL: "-mcmodel=small"
+// MEDIUM: "-mcmodel=medium"
From c641b611f86a846a51763a54a196375aba3e6e4e Mon Sep 17 00:00:00 2001
From: YunQiang Su
Date: Wed, 11 Sep 2024 09:37:12 +0800
Subject: [PATCH 043/114] MIPSr6: Add llvm.is.fpclass intrinsic support (#107857)

MIPSr6 has class.s/class.d instructions. Let's use them for the
llvm.is.fpclass intrinsic.
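The core of the change is the mapping from LLVM's FPClassTest bits to the mask
produced by the class.s/class.d instructions. As a standalone cross-check,
here is an editorial sketch, not part of the patch: the FPClassTest enumerator
values are hand-copied (nominally from llvm/ADT/FloatingPointMode.h) and the
MIPS-side bits mirror the Mips::FClassMask enum added below, so verify both
against the headers before reusing this.

#include <cstdint>

// LLVM-side llvm.is.fpclass test bits (assumed values).
enum FPClassTest : uint32_t {
  fcSNan = 0x0001, fcQNan = 0x0002, fcNegInf = 0x0004, fcNegNormal = 0x0008,
  fcNegSubnormal = 0x0010, fcNegZero = 0x0020, fcPosZero = 0x0040,
  fcPosSubnormal = 0x0080, fcPosNormal = 0x0100, fcPosInf = 0x0200,
};

// Same pairing as the to_fclass_mask SDNodeXForm in the diff below:
// one LLVM test bit maps to one MIPS class-instruction result bit.
constexpr uint32_t toFClassMask(uint32_t Check) {
  uint32_t Mask = 0;
  if (Check & fcSNan)         Mask |= 1u << 0;
  if (Check & fcQNan)         Mask |= 1u << 1;
  if (Check & fcNegInf)       Mask |= 1u << 2;
  if (Check & fcNegNormal)    Mask |= 1u << 3;
  if (Check & fcNegSubnormal) Mask |= 1u << 4;
  if (Check & fcNegZero)      Mask |= 1u << 5;
  if (Check & fcPosInf)       Mask |= 1u << 6;
  if (Check & fcPosNormal)    Mask |= 1u << 7;
  if (Check & fcPosSubnormal) Mask |= 1u << 8;
  if (Check & fcPosZero)      Mask |= 1u << 9;
  return Mask;
}

// These match the `andi` immediates in the new is_fpclass.ll tests.
static_assert(toFClassMask(0x003) == 3, "nan = snan|qnan");
static_assert(toFClassMask(0x204) == 68, "inf = -inf|+inf");
static_assert(toFClassMask(0x1f8) == 952, "finite = normal|subnormal|zero");
static_assert(toFClassMask(0x060) == 544, "zero = -0|+0");

For example, checking for "inf" (0x204) becomes class.s followed by
andi $1, $1, 68, exactly as the isinf_float test below expects.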
---
 llvm/lib/Target/Mips/Mips32r6InstrInfo.td |  37 ++++
 llvm/lib/Target/Mips/MipsISelLowering.cpp |   5 +-
 llvm/lib/Target/Mips/MipsInstrInfo.h      |  17 ++
 llvm/test/CodeGen/Mips/is_fpclass.ll      | 246 ++++++++++++++++++++++
 4 files changed, 303 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/Mips/is_fpclass.ll

diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index 91ffbc4eb77dd..27b9ce60ba826 100644
--- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -1139,6 +1139,43 @@ let AdditionalPredicates = [NotInMicroMips] in {
     ISA_MIPS32R6;
 }
 
+// llvm.is_fpclass operations.
+def to_fclass_mask: SDNodeXForm<timm, [{
+  uint64_t Check = N->getZExtValue();
+  unsigned Mask = 0;
+  if (Check & fcSNan)
+    Mask |= Mips::FClassMaskSignalingNaN;
+  if (Check & fcQNan)
+    Mask |= Mips::FClassMaskQuietNaN;
+  if (Check & fcPosInf)
+    Mask |= Mips::FClassMaskPositiveInfinity;
+  if (Check & fcNegInf)
+    Mask |= Mips::FClassMaskNegativeInfinity;
+  if (Check & fcPosNormal)
+    Mask |= Mips::FClassMaskPositiveNormal;
+  if (Check & fcNegNormal)
+    Mask |= Mips::FClassMaskNegativeNormal;
+  if (Check & fcPosSubnormal)
+    Mask |= Mips::FClassMaskPositiveSubnormal;
+  if (Check & fcNegSubnormal)
+    Mask |= Mips::FClassMaskNegativeSubnormal;
+  if (Check & fcPosZero)
+    Mask |= Mips::FClassMaskPositiveZero;
+  if (Check & fcNegZero)
+    Mask |= Mips::FClassMaskNegativeZero;
+  return CurDAG->getTargetConstant(Mask, SDLoc(N), MVT::i32);
+}]>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(is_fpclass f32:$lhs, i32:$imm),
+                (SLTu ZERO, (ANDi (MFC1 (CLASS_S f32:$lhs)),
+                                  (to_fclass_mask imm:$imm)))>,
+        ISA_MIPS32R6;
+  def : MipsPat<(is_fpclass f64:$lhs, i32:$imm),
+                (SLTu ZERO, (ANDi (MFC1_D64 (CLASS_D f64:$lhs)),
+                                  (to_fclass_mask imm:$imm)))>,
+        ISA_MIPS32R6;
+}
+
 // Pseudo instructions
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
     hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT], hasPostISelHook = 1 in {
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index fa57a3fa9b155..59f78a8ca306c 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -359,8 +359,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 
-  // Lower fmin and fmax operations for MIPS R6.
-  // Instructions are defined but never used.
+  // Lower fmin/fmax/fclass operations for MIPS R6.
if (Subtarget.hasMips32r6()) { setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); @@ -370,6 +369,8 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); setOperationAction(ISD::FMINNUM, MVT::f64, Expand); setOperationAction(ISD::FMAXNUM, MVT::f64, Expand); + setOperationAction(ISD::IS_FPCLASS, MVT::f32, Legal); + setOperationAction(ISD::IS_FPCLASS, MVT::f64, Legal); } else { setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index dc4b9d99b39d2..4e039e0e32aba 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -213,6 +213,23 @@ class MipsInstrInfo : public MipsGenInstrInfo { const MipsInstrInfo *createMips16InstrInfo(const MipsSubtarget &STI); const MipsInstrInfo *createMipsSEInstrInfo(const MipsSubtarget &STI); +namespace Mips { +// Mask assignments for floating-point. +enum FClassMask { + FClassMaskSignalingNaN = 1 << 0, + FClassMaskQuietNaN = 1 << 1, + FClassMaskNegativeInfinity = 1 << 2, + FClassMaskNegativeNormal = 1 << 3, + FClassMaskNegativeSubnormal = 1 << 4, + FClassMaskNegativeZero = 1 << 5, + FClassMaskPositiveInfinity = 1 << 6, + FClassMaskPositiveNormal = 1 << 7, + FClassMaskPositiveSubnormal = 1 << 8, + FClassMaskPositiveZero = 1 << 9 +}; + +} // namespace Mips + } // end namespace llvm #endif // LLVM_LIB_TARGET_MIPS_MIPSINSTRINFO_H diff --git a/llvm/test/CodeGen/Mips/is_fpclass.ll b/llvm/test/CodeGen/Mips/is_fpclass.ll new file mode 100644 index 0000000000000..9454a064c5312 --- /dev/null +++ b/llvm/test/CodeGen/Mips/is_fpclass.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=mipsisa32r6-unknown-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s + + +define i1 @isnan_float(float %x) nounwind { +; CHECK-LABEL: isnan_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) ; nan + ret i1 %1 +} + +define i1 @isnan_double(double %x) nounwind { +; CHECK-LABEL: isnan_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) ; nan + ret i1 %1 +} + +define i1 @isnan_float_strictfp(float %x) strictfp nounwind { +; CHECK-LABEL: isnan_float_strictfp: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) strictfp ; nan + ret i1 %1 +} + +define i1 @isnan_double_strictfp(double %x) strictfp nounwind { +; CHECK-LABEL: isnan_double_strictfp: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) strictfp ; nan + ret i1 %1 +} + +define i1 @isinf_float(float %x) nounwind { +; CHECK-LABEL: isinf_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 68 +; CHECK-NEXT: jr $ra +; 
CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" + ret i1 %1 +} + +define i1 @isfinite_float(float %x) nounwind { +; CHECK-LABEL: isfinite_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 952 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" + ret i1 %1 +} + +define i1 @isnormal_float(float %x) nounwind { +; CHECK-LABEL: isnormal_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 136 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 264) ; 0x108 = "normal" + ret i1 %1 +} + +define i1 @issubnormal_float(float %x) nounwind { +; CHECK-LABEL: issubnormal_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 272 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 144) ; 0x90 = "subnormal" + ret i1 %1 +} + +define i1 @iszero_float(float %x) nounwind { +; CHECK-LABEL: iszero_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 544 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 96) ; 0x60 = "zero" + ret i1 %1 +} + +define i1 @issnan_float(float %x) nounwind { +; CHECK-LABEL: issnan_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 1 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 1) + ret i1 %1 +} + +define i1 @issnan_double(double %x) nounwind { +; CHECK-LABEL: issnan_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 1 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 1) + ret i1 %1 +} + +define i1 @isqnan_float(float %x) nounwind { +; CHECK-LABEL: isqnan_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 2 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 2) + ret i1 %1 +} + +define i1 @isqnan_double(double %x) nounwind { +; CHECK-LABEL: isqnan_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 2 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 2) + ret i1 %1 +} + +define i1 @isposzero_double(double %x) nounwind { +; CHECK-LABEL: isposzero_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 512 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 64) + ret i1 %1 +} + +define i1 @isnegzero_double(double %x) nounwind { +; CHECK-LABEL: isnegzero_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 32 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 32) + ret i1 %1 +} + +define i1 @isposnormal_double(double %x) nounwind { +; CHECK-LABEL: isposnormal_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; 
CHECK-NEXT: andi $1, $1, 128 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 256) + ret i1 %1 +} + +define i1 @isnegnormal_double(double %x) nounwind { +; CHECK-LABEL: isnegnormal_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 8 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 8) + ret i1 %1 +} + +define i1 @isnormal_double(double %x) nounwind { +; CHECK-LABEL: isnormal_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 136 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 264) + ret i1 %1 +} + +define i1 @isclass_00d_double(double %x) nounwind { +; CHECK-LABEL: isclass_00d_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 13 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 13) + ret i1 %1 +} + +define i1 @isclass_1c0_float(float %x) nounwind { +; CHECK-LABEL: isclass_1c0_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 896 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 448) + ret i1 %1 +} + +declare i1 @llvm.is.fpclass.f32(float, i32) +declare i1 @llvm.is.fpclass.f64(double, i32) From 21a0176c584c47218f20322641af8a855b8ce5e2 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 11 Sep 2024 09:38:29 +0800 Subject: [PATCH 044/114] [RISCV] Rematerialize vfmv.v.f (#108007) This is the same principle as vmv.v.x in #107993, but for floats. --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 1 + .../Target/RISCV/RISCVInstrInfoVPseudos.td | 1 + llvm/test/CodeGen/RISCV/rvv/remat.ll | 65 +++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 2bb9df4ead0e9..a805c68e7795c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -170,6 +170,7 @@ bool RISCVInstrInfo::isReallyTriviallyReMaterializable( const MachineInstr &MI) const { switch (RISCV::getRVVMCOpcode(MI.getOpcode())) { case RISCV::VMV_V_X: + case RISCV::VFMV_V_F: case RISCV::VMV_V_I: case RISCV::VID_V: if (MI.getOperand(1).isUndef() && diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index c6cecb7d07182..2eceef5066f77 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -6558,6 +6558,7 @@ defm PseudoVFMERGE : VPseudoVMRG_FM; //===----------------------------------------------------------------------===// // 13.16. 
Vector Floating-Point Move Instruction
//===----------------------------------------------------------------------===//
+let isReMaterializable = 1 in
 defm PseudoVFMV_V : VPseudoVMV_F;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll
index 514612cd0525d..343b086898c14 100644
--- a/llvm/test/CodeGen/RISCV/rvv/remat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll
@@ -312,3 +312,68 @@ define void @vmv.v.x_live(ptr %p, i64 %x) {
   store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
   store volatile i64 %x, ptr %p
   ret void
 }
+
+define void @vfmv.v.f(ptr %p, double %x) {
+; POSTRA-LABEL: vfmv.v.f:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; POSTRA-NEXT: vfmv.v.f v8, fa0
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vl8re64.v v16, (a0)
+; POSTRA-NEXT: vl8re64.v v24, (a0)
+; POSTRA-NEXT: vl8re64.v v0, (a0)
+; POSTRA-NEXT: vl8re64.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v0, (a0)
+; POSTRA-NEXT: vs8r.v v24, (a0)
+; POSTRA-NEXT: vs8r.v v16, (a0)
+; POSTRA-NEXT: vfmv.v.f v8, fa0
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: fsd fa0, 0(a0)
+; POSTRA-NEXT: ret
+;
+; PRERA-LABEL: vfmv.v.f:
+; PRERA: # %bb.0:
+; PRERA-NEXT: addi sp, sp, -16
+; PRERA-NEXT: .cfi_def_cfa_offset 16
+; PRERA-NEXT: csrr a1, vlenb
+; PRERA-NEXT: slli a1, a1, 3
+; PRERA-NEXT: sub sp, sp, a1
+; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; PRERA-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; PRERA-NEXT: vfmv.v.f v8, fa0
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: addi a1, sp, 16
+; PRERA-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; PRERA-NEXT: vl8re64.v v24, (a0)
+; PRERA-NEXT: vl8re64.v v0, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v0, (a0)
+; PRERA-NEXT: vs8r.v v24, (a0)
+; PRERA-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: fsd fa0, 0(a0)
+; PRERA-NEXT: csrr a0, vlenb
+; PRERA-NEXT: slli a0, a0, 3
+; PRERA-NEXT: add sp, sp, a0
+; PRERA-NEXT: addi sp, sp, 16
+; PRERA-NEXT: ret
+  %vfmv.v.f = call <vscale x 8 x double> @llvm.riscv.vfmv.v.f.nxv8f64(<vscale x 8 x double> poison, double %x, i64 -1)
+  store volatile <vscale x 8 x double> %vfmv.v.f, ptr %p
+
+  %a = load volatile <vscale x 8 x double>, ptr %p
+  %b = load volatile <vscale x 8 x double>, ptr %p
+  %c = load volatile <vscale x 8 x double>, ptr %p
+  %d = load volatile <vscale x 8 x double>, ptr %p
+  store volatile <vscale x 8 x double> %d, ptr %p
+  store volatile <vscale x 8 x double> %c, ptr %p
+  store volatile <vscale x 8 x double> %b, ptr %p
+  store volatile <vscale x 8 x double> %a, ptr %p
+
+  store volatile <vscale x 8 x double> %vfmv.v.f, ptr %p
+  store volatile double %x, ptr %p
+  ret void
+}
From 933fc63a1d230896bc09a08cf08dde4ac5b51703 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 11 Sep 2024 09:44:57 +0800
Subject: [PATCH 045/114] [RISCV] Rematerialize vmv.s.x and vfmv.s.f (#108012)

Continuing with #107993 and #108007, this handles the last of the main
rematerializable vector instructions. There's an extra spill in one of
the test cases, but it's likely noise from the spill weights and isn't
an issue in practice.
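The liveness condition that decides between rematerializing and spilling in
this patch (and in #107993 and #108007 before it) is the generic one in
LiveRangeEdit. The sketch below is an editorial paraphrase of what
LiveRangeEdit::allUsesAvailableAt does, simplified and not the verbatim
upstream code; treat the exact signature as an assumption.

#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Returns true if every register read of OrigMI (e.g. the scalar source of
// vmv.s.x or the FP source of vfmv.s.f) still carries the same value at
// UseIdx, the point where the def would be re-created. If any value differs
// or is dead there, InlineSpiller falls back to a stack spill, which is
// where the extra spill mentioned above comes from.
static bool allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex OrigIdx,
                               SlotIndex UseIdx, const LiveIntervals &LIS) {
  for (const MachineOperand &MO : OrigMI->operands()) {
    if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
      continue;
    if (!MO.getReg().isVirtual())
      continue; // Physical-reg implicits (vl/vtype) are checked elsewhere.
    const LiveInterval &LI = LIS.getInterval(MO.getReg());
    const VNInfo *OVNI = LI.getVNInfoAt(OrigIdx);
    if (!OVNI || OVNI != LI.getVNInfoAt(UseIdx))
      return false;
  }
  return true;
}

Because RISC-V allocates scalar and vector registers in separate passes, the
scalar assignment is already fixed when vector registers are spilled, so this
check never forces a scalar live-range extension; it simply declines to
rematerialize when the scalar is no longer live, as @vmv.v.x_needs_extended
demonstrated in the earlier patch.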
--- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 + .../Target/RISCV/RISCVInstrInfoVPseudos.td | 4 +- .../rvv/fixed-vectors-interleaved-access.ll | 640 +++++++++--------- llvm/test/CodeGen/RISCV/rvv/remat.ll | 130 ++++ 4 files changed, 464 insertions(+), 312 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index a805c68e7795c..13212c2aea5dd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -172,6 +172,8 @@ bool RISCVInstrInfo::isReallyTriviallyReMaterializable( case RISCV::VMV_V_X: case RISCV::VFMV_V_F: case RISCV::VMV_V_I: + case RISCV::VMV_S_X: + case RISCV::VFMV_S_F: case RISCV::VID_V: if (MI.getOperand(1).isUndef() && /* After RISCVInsertVSETVLI most pseudos will have implicit uses on vl diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 2eceef5066f77..430e09fd834ba 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -6764,7 +6764,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { Pseudo<(outs GPR:$rd), (ins VR:$rs2, ixlenimm:$sew), []>, Sched<[WriteVMovXS, ReadVMovXS]>, RISCVVPseudo; - let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, + let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1, Constraints = "$rd = $rs1" in def PseudoVMV_S_X: Pseudo<(outs VR:$rd), (ins VR:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew), @@ -6787,7 +6787,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { (ins VR:$rs2, ixlenimm:$sew), []>, Sched<[WriteVMovFS, ReadVMovFS]>, RISCVVPseudo; - let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, + let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1, Constraints = "$rd = $rs1" in def "PseudoVFMV_S_" # f.FX : Pseudo<(outs VR:$rd), diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index bc3e135a588a6..eff56e408d6d5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -159,296 +159,308 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 80 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 84 * vlenb ; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a3) +; RV32-NEXT: vle32.v v8, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 6 +; RV32-NEXT: li a4, 76 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a1, 128 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vslideup.vi v8, v16, 4 +; RV32-NEXT: vslideup.vi v4, v8, 4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 40 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: 
vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 12 ; RV32-NEXT: vmv.s.x v0, a4 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 +; RV32-NEXT: li a5, 24 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v3, v0 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v8, v16, 10, v0.t +; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 44 +; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vslideup.vi v4, v8, 10, v0.t ; RV32-NEXT: lui a4, %hi(.LCPI6_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v8, (a4) -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v0, (a4) ; RV32-NEXT: lui a4, %hi(.LCPI6_1) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_1) ; RV32-NEXT: lui a5, 1 ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a6, 24 +; RV32-NEXT: li a6, 56 ; RV32-NEXT: mul a4, a4, a6 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 72 +; RV32-NEXT: li a4, 68 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, a5, -64 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vmv.s.x v16, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v8, v4 +; RV32-NEXT: vs1r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v16, v8, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 
12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v8, v16 +; RV32-NEXT: vmv.v.v v4, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v12, v8, 2 -; RV32-NEXT: vmv1r.v v8, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v3, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vslideup.vi v12, v16, 8, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v0, (a1) -; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 28 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v10, (a1) +; RV32-NEXT: vle16.v v2, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v24, v16, v0 +; RV32-NEXT: vrgatherei16.vv v24, v16, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v4, v0.t +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 28 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v8, v4, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded 
Reload ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v12, v24 +; RV32-NEXT: vmv.v.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v24, v10 -; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vrgatherei16.vv v8, v24, v2 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v24, 6, v0.t +; RV32-NEXT: vslideup.vi v8, v24, 6, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a3, 44 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v12, (a1) -; RV32-NEXT: vle16.v v8, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v0, v12 -; RV32-NEXT: vmv1r.v v3, v8 -; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v8, v16, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) ; RV32-NEXT: lui a3, %hi(.LCPI6_8) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_8) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI6_9) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_9) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v4, (a3) -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: 
vle16.v v8, (a3) +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 2 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs4r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v24, v8 +; RV32-NEXT: vrgatherei16.vv v12, v8, v16 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v24, v16 ; RV32-NEXT: vslideup.vi v12, v16, 4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v16, v4 -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vrgatherei16.vv v8, v0, v20 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 @@ -461,48 +473,51 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v3, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v8, v16, 6 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v8, v24, 6 ; RV32-NEXT: vmv1r.v v0, v3 -; RV32-NEXT: vrgatherei16.vv v8, v24, v12, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v12, v0.t ; 
RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv4r.v v24, v16 ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v28, (a1) ; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: li a1, 1008 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v24 +; RV32-NEXT: vrgatherei16.vv v8, v16, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill @@ -511,14 +526,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a3, %hi(.LCPI6_14) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_14) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v20, (a1) +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI6_15) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_15) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v24, (a3) -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v28, (a3) +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 40 @@ -526,21 +541,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v16, v8, v20, v0.t +; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a3, 44 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -548,20 +558,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; 
RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v8, v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 60 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -570,56 +580,57 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v28, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 12 +; RV32-NEXT: li a2, 76 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv.v.v v28, v0 +; RV32-NEXT: vmv.v.v v24, v0 ; RV32-NEXT: vmv.v.v v16, v8 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vse32.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v28, (a1) -; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: vse32.v v24, (a1) +; RV32-NEXT: addi a1, a0, 192 +; RV32-NEXT: vse32.v v28, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 44 +; RV32-NEXT: li a2, 36 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 80 +; RV32-NEXT: li a1, 84 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 @@ -630,15 +641,15 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 74 +; RV64-NEXT: li a3, 66 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xca, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 74 * vlenb +; 
RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 66 * vlenb ; RV64-NEXT: addi a2, a1, 256 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 25 +; RV64-NEXT: li a3, 21 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 @@ -646,76 +657,85 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a2, a1, 128 ; RV64-NEXT: vle64.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a3, a1, 6 -; RV64-NEXT: add a1, a3, a1 +; RV64-NEXT: li a3, 57 +; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vrgather.vi v12, v16, 4 ; RV64-NEXT: li a1, 128 -; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma ; RV64-NEXT: vslidedown.vi v16, v16, 8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 49 +; RV64-NEXT: li a3, 37 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vrgather.vi v12, v16, 2, v0.t ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vid.v v10 ; RV64-NEXT: li a1, 6 -; RV64-NEXT: vmul.vx v2, v10, a1 +; RV64-NEXT: vmul.vx v8, v10, a1 ; RV64-NEXT: li a1, 56 -; RV64-NEXT: vle64.v v16, (a2) +; RV64-NEXT: vle64.v v24, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 57 +; RV64-NEXT: li a3, 45 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v7, a1 -; RV64-NEXT: vadd.vi v10, v2, -16 +; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v10, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v24, v2 -; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vs1r.v v10, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v10, v8, -16 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 57 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v16, v0, v8 +; RV64-NEXT: vmv2r.v v4, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v6, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv1r.v v0, v6 ; RV64-NEXT: vrgatherei16.vv v16, v24, v10, v0.t ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v12, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) 
# Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 +; RV64-NEXT: li a2, 21 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v12, v16, 5 -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vmv1r.v v6, v8 +; RV64-NEXT: vrgather.vi v12, v8, 5 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -723,19 +743,19 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgather.vi v12, v16, 3, v0.t ; RV64-NEXT: vmv.v.v v28, v12 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v24, v2, 1 -; RV64-NEXT: vadd.vi v26, v2, -15 +; RV64-NEXT: vadd.vi v24, v4, 1 +; RV64-NEXT: vadd.vi v26, v4, -15 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; RV64-NEXT: vrgatherei16.vv v16, v8, v24 -; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vmv1r.v v0, v6 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -744,8 +764,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v28, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 4 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 13 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill @@ -755,7 +775,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vmv.v.i v9, 6 ; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 +; RV64-NEXT: li a2, 21 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -763,259 +783,253 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vrgatherei16.vv v12, v16, v9 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vrgatherei16.vv v12, v16, v10 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vmv4r.v v8, v16 ; RV64-NEXT: vrgather.vi v12, v16, 2 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vrgather.vi v12, v16, 3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli 
a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: li a1, 24 -; RV64-NEXT: vmv.s.x v1, a1 -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v24, v2, 2 -; RV64-NEXT: vadd.vi v4, v2, -14 +; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v16, v24 -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v16, v4, 2 +; RV64-NEXT: vadd.vi v2, v4, -14 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 57 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v8, v24, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v16, v2, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v20, v16, 4, v0.t +; RV64-NEXT: vrgather.vi v28, v24, 4, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv2r.v v8, v4 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v4, v2, 3 -; RV64-NEXT: vadd.vi v8, v2, -13 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v4, v4, 3 +; RV64-NEXT: vadd.vi v6, v8, -13 +; RV64-NEXT: vmv2r.v v2, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v16, v4 -; RV64-NEXT: vmv1r.v 
v0, v1 +; RV64-NEXT: vrgatherei16.vv v8, v24, v4 ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v16, v6, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v8, v24, 5, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vrgather.vi v4, v16, 5, v0.t ; RV64-NEXT: lui a1, 96 ; RV64-NEXT: li a2, 192 -; RV64-NEXT: vmv.s.x v28, a2 +; RV64-NEXT: vmv.s.x v1, a2 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: vmv1r.v v0, v28 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v12, v24, v8, v0.t +; RV64-NEXT: vrgatherei16.vv v12, v16, v8, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: li a1, 28 ; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v30, v2, 4 -; RV64-NEXT: vadd.vi v6, v2, -12 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: slli a2, a1, 3 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v8, v30 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v22, v2, 4 +; RV64-NEXT: vadd.vi v20, v2, -12 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 57 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v16, v8, v6, v0.t +; RV64-NEXT: vl8r.v 
v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v8, v24, v22 ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 45 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v24, v20, v0.t ; RV64-NEXT: lui a1, 112 ; RV64-NEXT: addi a1, a1, 1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.x v12, a1 -; RV64-NEXT: vmv1r.v v0, v28 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v24, v12, v0.t +; RV64-NEXT: vrgatherei16.vv v20, v16, v12, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 -; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v16, v24 -; RV64-NEXT: vmv2r.v v8, v2 +; RV64-NEXT: vmv.v.v v12, v24 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vadd.vi v12, v2, 5 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v0, v12 +; RV64-NEXT: vrgatherei16.vv v24, v16, v12 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v2, v8, -11 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v12, v2, -11 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v24, v8, v2, v0.t +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v 
v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; RV64-NEXT: vmv4r.v v12, v4 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v12, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v20, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv.v.v v20, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload @@ -1028,24 +1042,30 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a1, a0, 192 ; RV64-NEXT: vse64.v v12, (a1) ; RV64-NEXT: addi a1, a0, 128 -; RV64-NEXT: vse64.v v16, (a1) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 53 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 64 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 4 -; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: li a3, 13 +; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 74 +; RV64-NEXT: li a1, 66 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll index 343b086898c14..4f58ccb5188d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/remat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll @@ -377,3 +377,133 @@ define void @vfmv.v.f(ptr %p, double %x) { store volatile double %x, ptr %p ret void } + +define void @vmv.s.x(ptr %p, i64 %x) { +; POSTRA-LABEL: vmv.s.x: +; POSTRA: # %bb.0: +; POSTRA-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; POSTRA-NEXT: vmv.s.x v8, a1 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vl8re64.v v16, (a0) +; POSTRA-NEXT: vl8re64.v v24, (a0) +; POSTRA-NEXT: vl8re64.v v0, (a0) +; POSTRA-NEXT: vl8re64.v v8, (a0) +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vs8r.v v0, (a0) +; POSTRA-NEXT: vs8r.v v24, (a0) +; POSTRA-NEXT: vs8r.v v16, (a0) +; POSTRA-NEXT: vmv.s.x v8, a1 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: sd a1, 0(a0) +; POSTRA-NEXT: ret +; +; PRERA-LABEL: vmv.s.x: +; PRERA: # %bb.0: +; PRERA-NEXT: addi sp, sp, -16 +; PRERA-NEXT: .cfi_def_cfa_offset 16 +; PRERA-NEXT: csrr a2, vlenb +; 
PRERA-NEXT: slli a2, a2, 3 +; PRERA-NEXT: sub sp, sp, a2 +; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; PRERA-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; PRERA-NEXT: vmv.s.x v8, a1 +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: addi a2, sp, 16 +; PRERA-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; PRERA-NEXT: vl8re64.v v24, (a0) +; PRERA-NEXT: vl8re64.v v0, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v0, (a0) +; PRERA-NEXT: vs8r.v v24, (a0) +; PRERA-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: sd a1, 0(a0) +; PRERA-NEXT: csrr a0, vlenb +; PRERA-NEXT: slli a0, a0, 3 +; PRERA-NEXT: add sp, sp, a0 +; PRERA-NEXT: addi sp, sp, 16 +; PRERA-NEXT: ret + %vmv.s.x = call <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64> poison, i64 %x, i64 -1) + store volatile <vscale x 8 x i64> %vmv.s.x, ptr %p + + %a = load volatile <vscale x 8 x i64>, ptr %p + %b = load volatile <vscale x 8 x i64>, ptr %p + %c = load volatile <vscale x 8 x i64>, ptr %p + %d = load volatile <vscale x 8 x i64>, ptr %p + store volatile <vscale x 8 x i64> %d, ptr %p + store volatile <vscale x 8 x i64> %c, ptr %p + store volatile <vscale x 8 x i64> %b, ptr %p + store volatile <vscale x 8 x i64> %a, ptr %p + + store volatile <vscale x 8 x i64> %vmv.s.x, ptr %p + store volatile i64 %x, ptr %p + ret void +} + +define void @vfmv.s.f(ptr %p, double %x) { +; POSTRA-LABEL: vfmv.s.f: +; POSTRA: # %bb.0: +; POSTRA-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; POSTRA-NEXT: vfmv.s.f v8, fa0 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vl8re64.v v16, (a0) +; POSTRA-NEXT: vl8re64.v v24, (a0) +; POSTRA-NEXT: vl8re64.v v0, (a0) +; POSTRA-NEXT: vl8re64.v v8, (a0) +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vs8r.v v0, (a0) +; POSTRA-NEXT: vs8r.v v24, (a0) +; POSTRA-NEXT: vs8r.v v16, (a0) +; POSTRA-NEXT: vfmv.s.f v8, fa0 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: fsd fa0, 0(a0) +; POSTRA-NEXT: ret +; +; PRERA-LABEL: vfmv.s.f: +; PRERA: # %bb.0: +; PRERA-NEXT: addi sp, sp, -16 +; PRERA-NEXT: .cfi_def_cfa_offset 16 +; PRERA-NEXT: csrr a1, vlenb +; PRERA-NEXT: slli a1, a1, 3 +; PRERA-NEXT: sub sp, sp, a1 +; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; PRERA-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; PRERA-NEXT: vfmv.s.f v8, fa0 +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: addi a1, sp, 16 +; PRERA-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; PRERA-NEXT: vl8re64.v v24, (a0) +; PRERA-NEXT: vl8re64.v v0, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v0, (a0) +; PRERA-NEXT: vs8r.v v24, (a0) +; PRERA-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: fsd fa0, 0(a0) +; PRERA-NEXT: csrr a0, vlenb +; PRERA-NEXT: slli a0, a0, 3 +; PRERA-NEXT: add sp, sp, a0 +; PRERA-NEXT: addi sp, sp, 16 +; PRERA-NEXT: ret + %vfmv.s.f = call <vscale x 8 x double> @llvm.riscv.vfmv.s.f.nxv8f64(<vscale x 8 x double> poison, double %x, i64 -1) + store volatile <vscale x 8 x double> %vfmv.s.f, ptr %p + + %a = load volatile <vscale x 8 x double>, ptr %p + %b = load volatile <vscale x 8 x double>, ptr %p + %c = load volatile <vscale x 8 x double>, ptr %p + %d = load volatile <vscale x 8 x double>, ptr %p + store volatile <vscale x 8 x double> %d, ptr %p + store volatile <vscale x 8 x double> %c, ptr %p + store volatile <vscale x 8 x double> %b, ptr %p + store volatile <vscale x 8 x double> %a, ptr %p + + store volatile <vscale x 8 x double> %vfmv.s.f, ptr %p + store volatile double %x, ptr %p + ret void +} From 901006f238aae8dd7e75d173bf9429e8e44f6385 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10
Sep 2024 18:45:06 -0700 Subject: [PATCH 046/114] =?UTF-8?q?[flang]=20Make=20flang=20module=20hidde?= =?UTF-8?q?n=20dependency=20explicit=20to=20correct=20build=E2=80=A6=20(#1?= =?UTF-8?q?08129)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … failure Any flang module with a derived type definition implicitly depends on flang/module/__fortran_type_info.f90. Make this dependency explicit so that an unlucky build order doesn't cause a crash. --- flang/tools/f18/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index 344a781c41e95..4670362f7a103 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -72,9 +72,6 @@ if (NOT CMAKE_CROSSCOMPILING) set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_builtins.mod) else() set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_builtins.mod) - if(NOT ${filename} STREQUAL "__fortran_type_info") - set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_type_info.mod) - endif() if(${filename} STREQUAL "iso_fortran_env") set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/iso_fortran_env_impl.mod) endif() @@ -83,6 +80,9 @@ if (NOT CMAKE_CROSSCOMPILING) set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_ieee_exceptions.mod) endif() endif() + if(NOT ${filename} STREQUAL "__fortran_type_info") + set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_type_info.mod) + endif() # The module contains PPC vector types that needs the PPC target. if(${filename} STREQUAL "__ppc_intrinsics" OR From 12530015a45accd6cf9dd6d565c89b1d7e562be5 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 10 Sep 2024 18:47:20 -0700 Subject: [PATCH 047/114] [RISCV] Add reductions to list of roots in tryToReduceVL (#107595) This allows us to reduce VLs feeding reduction instructions. In particular, this means that <3 x Ty> reduce(load) like sequences no longer require a VL toggle. This was waiting on 3d72957; now that the latent correctness issue is fixed, we can expand this transform. --- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 18 ++++++ .../redundant-copy-from-tail-duplicate.ll | 2 +- .../rvv/fixed-vectors-reduction-formation.ll | 49 ++++++---------- .../CodeGen/RISCV/rvv/vreductions-fp-vp.ll | 56 +++++++++---------- 4 files changed, 64 insertions(+), 61 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 298f3317bf61a..026e5d653c38c 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -145,6 +145,24 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { case RISCV::VMERGE_VVM: SrcIdx = 3; // TODO: We can also handle the false operand. 
break; + case RISCV::VREDSUM_VS: + case RISCV::VREDMAXU_VS: + case RISCV::VREDMAX_VS: + case RISCV::VREDMINU_VS: + case RISCV::VREDMIN_VS: + case RISCV::VREDAND_VS: + case RISCV::VREDOR_VS: + case RISCV::VREDXOR_VS: + case RISCV::VWREDSUM_VS: + case RISCV::VWREDSUMU_VS: + case RISCV::VFREDUSUM_VS: + case RISCV::VFREDOSUM_VS: + case RISCV::VFREDMAX_VS: + case RISCV::VFREDMIN_VS: + case RISCV::VFWREDUSUM_VS: + case RISCV::VFWREDOSUM_VS: + SrcIdx = 2; + break; } MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc())); diff --git a/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll b/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll index 3d367ddc59bca..5d588ad66b9ca 100644 --- a/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll +++ b/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll @@ -19,7 +19,7 @@ define signext i32 @sum(ptr %a, i32 signext %n, i1 %prof.min.iters.check, %v, %v, %v, %v, %v, %v, %v, %v, %v, %v, %v, %v, Date: Wed, 11 Sep 2024 09:53:04 +0800 Subject: [PATCH 048/114] SelectionDAG: Remove unneeded getSelectCC in expandFMINIMUMNUM_FMAXIMUMNUM (#107416) ISD::FCANONICALIZE is enough, which can process NaN or non-NaN correctly, thus getSelectCC is not needed here. --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index b3307dc9b7730..03010c1df0014 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8616,10 +8616,7 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, // If MinMax is NaN, let's quiet it. if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS)) { - SDValue MinMaxQuiet = - DAG.getNode(ISD::FCANONICALIZE, DL, VT, MinMax, Flags); - MinMax = - DAG.getSelectCC(DL, MinMax, MinMax, MinMaxQuiet, MinMax, ISD::SETUO); + MinMax = DAG.getNode(ISD::FCANONICALIZE, DL, VT, MinMax, Flags); } // Fixup signed zero behavior. From 76151c449080b7239c8b442291514a4300d51cba Mon Sep 17 00:00:00 2001 From: ChiaHungDuan Date: Tue, 10 Sep 2024 18:56:49 -0700 Subject: [PATCH 049/114] Revert "[scudo] Fix the logic of MaxAllowedFragmentedPages" (#108130) Reverts llvm/llvm-project#107927 We are supposed to check the MaxAllowedFragmentedPages instead. --- compiler-rt/lib/scudo/standalone/secondary.h | 25 +++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index c79ec1360b00a..1a232b9b9fb2d 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -72,16 +72,13 @@ namespace { struct CachedBlock { static constexpr u16 CacheIndexMax = UINT16_MAX; static constexpr u16 InvalidEntry = CacheIndexMax; - // We allow a certain amount of fragmentation and part of the fragmented bytes - // will be released by `releaseAndZeroPagesToOS()`. This increases the chance - // of cache hit rate and reduces the overhead to the RSS at the same time. See - // more details in the `MapAllocatorCache::retrieve()` section. - // - // We arrived at this default value after noticing that mapping in larger - // memory regions performs better than releasing memory and forcing a cache - // hit. According to the data, it suggests that beyond 4 pages, the release - // execution time is longer than the map execution time. 
In this way, - // the default is dependent on the platform. + // * MaxReleasedCachePages default is currently 4 + // - We arrived at this value after noticing that mapping + // in larger memory regions performs better than releasing + // memory and forcing a cache hit. According to the data, + // it suggests that beyond 4 pages, the release execution time is + // longer than the map execution time. In this way, the default + // is dependent on the platform. static constexpr uptr MaxReleasedCachePages = 4U; uptr CommitBase = 0; @@ -728,14 +725,8 @@ MapAllocator::tryAllocateFromCache(const Options &Options, uptr Size, uptr EntryHeaderPos; uptr MaxAllowedFragmentedPages = MaxUnreleasedCachePages; - if (LIKELY(!useMemoryTagging(Options))) { + if (UNLIKELY(useMemoryTagging(Options))) MaxAllowedFragmentedPages += CachedBlock::MaxReleasedCachePages; - } else { - // TODO: Enable MaxReleasedCachePages may result in pages for an entry being - // partially released and it erases the tag of those pages as well. To - // support this feature for MTE, we need to tag those pages again. - DCHECK_EQ(CachedBlock::MaxReleasedCachePages, 0U); - } Entry = Cache.retrieve(MaxAllowedFragmentedPages, Size, Alignment, getHeadersSize(), EntryHeaderPos); From c5711130be758ca82216abb81c6b870a830e8f82 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 19:00:32 -0700 Subject: [PATCH 050/114] [flang] Fix cycle of build dependencies (#108132) While trying to fix one build problem, I made things worse. This should clear things up. --- flang/tools/f18/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index 4670362f7a103..9d7b8633958cb 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -80,7 +80,7 @@ if (NOT CMAKE_CROSSCOMPILING) set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_ieee_exceptions.mod) endif() endif() - if(NOT ${filename} STREQUAL "__fortran_type_info") + if(NOT ${filename} STREQUAL "__fortran_type_info" AND NOT ${filename} STREQUAL "__fortran_builtins") set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_type_info.mod) endif() From 3b4e7c9c4502d41ece4ef3431bbc12f055adabb5 Mon Sep 17 00:00:00 2001 From: Sterling-Augustine <56981066+Sterling-Augustine@users.noreply.github.com> Date: Wed, 11 Sep 2024 02:11:14 +0000 Subject: [PATCH 051/114] [SandboxIR] Implement ScalableVectorType (#108124) As in the heading. --- llvm/include/llvm/SandboxIR/Type.h | 83 +++++++++++++++++++++----- llvm/lib/SandboxIR/Type.cpp | 6 ++ llvm/unittests/SandboxIR/TypesTest.cpp | 59 ++++++++++++++++++ 3 files changed, 133 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index ec141c249fb21..a2ac9e014b44a 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -26,6 +26,7 @@ class Context; class PointerType; class VectorType; class FixedVectorType; +class ScalableVectorType; class IntegerType; class FunctionType; class ArrayType; @@ -39,21 +40,22 @@ class StructType; class Type { protected: llvm::Type *LLVMTy; - friend class ArrayType; // For LLVMTy. - friend class StructType; // For LLVMTy. - friend class VectorType; // For LLVMTy. - friend class FixedVectorType; // For LLVMTy. - friend class PointerType; // For LLVMTy. - friend class FunctionType; // For LLVMTy. - friend class IntegerType; // For LLVMTy. - friend class Function; // For LLVMTy. 
- friend class CallBase; // For LLVMTy. - friend class ConstantInt; // For LLVMTy. - friend class ConstantArray; // For LLVMTy. - friend class ConstantStruct; // For LLVMTy. - friend class ConstantVector; // For LLVMTy. - friend class CmpInst; // For LLVMTy. TODO: Cleanup after - // sandboxir::VectorType is more complete. + friend class ArrayType; // For LLVMTy. + friend class StructType; // For LLVMTy. + friend class VectorType; // For LLVMTy. + friend class FixedVectorType; // For LLVMTy. + friend class ScalableVectorType; // For LLVMTy. + friend class PointerType; // For LLVMTy. + friend class FunctionType; // For LLVMTy. + friend class IntegerType; // For LLVMTy. + friend class Function; // For LLVMTy. + friend class CallBase; // For LLVMTy. + friend class ConstantInt; // For LLVMTy. + friend class ConstantArray; // For LLVMTy. + friend class ConstantStruct; // For LLVMTy. + friend class ConstantVector; // For LLVMTy. + friend class CmpInst; // For LLVMTy. TODO: Cleanup after + // sandboxir::VectorType is more complete. // Friend all instruction classes because `create()` functions use LLVMTy. #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; @@ -390,6 +392,57 @@ class FixedVectorType : public VectorType { } }; +class ScalableVectorType : public VectorType { +public: + static ScalableVectorType *get(Type *ElementType, unsigned MinNumElts); + + static ScalableVectorType *get(Type *ElementType, + const ScalableVectorType *SVTy) { + return get(ElementType, SVTy->getMinNumElements()); + } + + static ScalableVectorType *getInteger(ScalableVectorType *VTy) { + return cast<ScalableVectorType>(VectorType::getInteger(VTy)); + } + + static ScalableVectorType * + getExtendedElementVectorType(ScalableVectorType *VTy) { + return cast<ScalableVectorType>( + VectorType::getExtendedElementVectorType(VTy)); + } + + static ScalableVectorType * + getTruncatedElementVectorType(ScalableVectorType *VTy) { + return cast<ScalableVectorType>( + VectorType::getTruncatedElementVectorType(VTy)); + } + + static ScalableVectorType *getSubdividedVectorType(ScalableVectorType *VTy, + int NumSubdivs) { + return cast<ScalableVectorType>( + VectorType::getSubdividedVectorType(VTy, NumSubdivs)); + } + + static ScalableVectorType * + getHalfElementsVectorType(ScalableVectorType *VTy) { + return cast<ScalableVectorType>(VectorType::getHalfElementsVectorType(VTy)); + } + + static ScalableVectorType * + getDoubleElementsVectorType(ScalableVectorType *VTy) { + return cast<ScalableVectorType>( + VectorType::getDoubleElementsVectorType(VTy)); + } + + unsigned getMinNumElements() const { + return cast<llvm::ScalableVectorType>(LLVMTy)->getMinNumElements(); + } + + static bool classof(const Type *T) { + return isa<llvm::ScalableVectorType>(T->LLVMTy); + } +}; + class FunctionType : public Type { public: // TODO: add missing functions diff --git a/llvm/lib/SandboxIR/Type.cpp b/llvm/lib/SandboxIR/Type.cpp index 26aa8b3743084..87dcb726dde35 100644 --- a/llvm/lib/SandboxIR/Type.cpp +++ b/llvm/lib/SandboxIR/Type.cpp @@ -108,6 +108,12 @@ FixedVectorType *FixedVectorType::get(Type *ElementType, unsigned NumElts) { llvm::FixedVectorType::get(ElementType->LLVMTy, NumElts))); } +ScalableVectorType *ScalableVectorType::get(Type *ElementType, + unsigned NumElts) { + return cast<ScalableVectorType>(ElementType->getContext().getType( + llvm::ScalableVectorType::get(ElementType->LLVMTy, NumElts))); +} + IntegerType *IntegerType::get(Context &Ctx, unsigned NumBits) { return cast<IntegerType>( Ctx.getType(llvm::IntegerType::get(Ctx.LLVMCtx, NumBits))); } diff --git a/llvm/unittests/SandboxIR/TypesTest.cpp b/llvm/unittests/SandboxIR/TypesTest.cpp index 3564ae6683014..40aa32fb08ed0 100644 --- a/llvm/unittests/SandboxIR/TypesTest.cpp +++ 
b/llvm/unittests/SandboxIR/TypesTest.cpp @@ -381,6 +381,65 @@ define void @foo(<4 x i16> %vi0, <4 x float> %vf1, i8 %i0) { EXPECT_EQ(Vec8i16Ty->getElementCount(), ElementCount::getFixed(8)); } +TEST_F(SandboxTypeTest, ScalableVectorType) { + parseIR(C, R"IR( +define void @foo(<vscale x 4 x i16> %vi0, <vscale x 4 x float> %vf1, i8 %i0) { + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + // Check classof(), creation, accessors + auto *Vec4i16Ty = + cast<sandboxir::ScalableVectorType>(F->getArg(0)->getType()); + EXPECT_TRUE(Vec4i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec4i16Ty->getMinNumElements(), 4u); + + // get(ElementType, NumElements) + EXPECT_EQ( + sandboxir::ScalableVectorType::get(sandboxir::Type::getInt16Ty(Ctx), 4), + F->getArg(0)->getType()); + // get(ElementType, Other) + EXPECT_EQ(sandboxir::ScalableVectorType::get( + sandboxir::Type::getInt16Ty(Ctx), + cast<sandboxir::ScalableVectorType>(F->getArg(0)->getType())), + F->getArg(0)->getType()); + auto *Vec4FTy = cast<sandboxir::ScalableVectorType>(F->getArg(1)->getType()); + EXPECT_TRUE(Vec4FTy->getElementType()->isFloatTy()); + // getInteger + auto *Vec4i32Ty = sandboxir::ScalableVectorType::getInteger(Vec4FTy); + EXPECT_TRUE(Vec4i32Ty->getElementType()->isIntegerTy(32)); + EXPECT_EQ(Vec4i32Ty->getMinNumElements(), Vec4FTy->getMinNumElements()); + // getExtendedElementCountVectorType + auto *Vec4i64Ty = + sandboxir::ScalableVectorType::getExtendedElementVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec4i64Ty->getElementType()->isIntegerTy(32)); + EXPECT_EQ(Vec4i64Ty->getMinNumElements(), Vec4i16Ty->getMinNumElements()); + // getTruncatedElementVectorType + auto *Vec4i8Ty = + sandboxir::ScalableVectorType::getTruncatedElementVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec4i8Ty->getElementType()->isIntegerTy(8)); + EXPECT_EQ(Vec4i8Ty->getMinNumElements(), Vec4i8Ty->getMinNumElements()); + // getSubdividedVectorType + auto *Vec8i8Ty = + sandboxir::ScalableVectorType::getSubdividedVectorType(Vec4i16Ty, 1); + EXPECT_TRUE(Vec8i8Ty->getElementType()->isIntegerTy(8)); + EXPECT_EQ(Vec8i8Ty->getMinNumElements(), 8u); + // getMinNumElements + EXPECT_EQ(Vec8i8Ty->getMinNumElements(), 8u); + // getHalfElementsVectorType + auto *Vec2i16Ty = + sandboxir::ScalableVectorType::getHalfElementsVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec2i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec2i16Ty->getMinNumElements(), 2u); + // getDoubleElementsVectorType + auto *Vec8i16Ty = + sandboxir::ScalableVectorType::getDoubleElementsVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec8i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec8i16Ty->getMinNumElements(), 8u); +} + TEST_F(SandboxTypeTest, FunctionType) { parseIR(C, R"IR( define void @foo() { From e67a6667dc2b46ece983321af89ae40ca7986b16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 10 Sep 2024 19:33:33 -0700 Subject: [PATCH 052/114] [flang][cuda] Avoid extra load in c_f_pointer lowering with c_devptr (#108090) Remove unnecessary load of the `cptr` component when getting the `__address`. `fir.coordinate_of` operation can be chained so the load is not needed.
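For illustration, here is a rough before/after sketch of the lowering this change affects, distilled from the updated test below. The SSA value names are invented and the `c_ptr`/`c_devptr` types are abbreviated for readability; the exact output is what the CHECK lines in the test verify.

```
// Before (sketch): load the whole c_ptr component, then extract __address
// from the loaded value.
%cptr = fir.coordinate_of %devptr, %cptr_field : (!fir.ref<!fir.type<c_devptr>>, !fir.field) -> !fir.ref<!fir.type<c_ptr{__address:i64}>>
%tmp  = fir.load %cptr : !fir.ref<!fir.type<c_ptr{__address:i64}>>
%addr = fir.extract_value %tmp, [0 : index] : (!fir.type<c_ptr{__address:i64}>) -> i64

// After (sketch): chain a second fir.coordinate_of straight to the field
// and load only the i64, avoiding the wider load.
%cptr  = fir.coordinate_of %devptr, %cptr_field : (!fir.ref<!fir.type<c_devptr>>, !fir.field) -> !fir.ref<!fir.type<c_ptr{__address:i64}>>
%fld   = fir.field_index __address, !fir.type<c_ptr{__address:i64}>
%paddr = fir.coordinate_of %cptr, %fld : (!fir.ref<!fir.type<c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
%addr  = fir.load %paddr : !fir.ref<i64>
```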
--- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 3 +-- flang/test/Lower/CUDA/cuda-devptr.cuf | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index c5a135a189e8d..d786d79ba8701 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -1594,8 +1594,7 @@ mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder, cPtrCoor = builder.create<fir::CoordinateOp>(loc, addrFieldTy, cPtr, arrayAttr); } - mlir::Value cptr = builder.create<fir::LoadOp>(loc, cPtrCoor); - return genCPtrOrCFunptrValue(builder, loc, cptr); + return genCPtrOrCFunptrValue(builder, loc, cPtrCoor); } if (fir::isa_ref_type(cPtr.getType())) { diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf index 21c5088b640fc..2eac890970d52 100644 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ -40,8 +40,9 @@ end ! CHECK: %[[X:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> ! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>, !fir.field) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> -! CHECK: %[[CPTR_LOAD:.*]] = fir.load %[[CPTR_COORD]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> -! CHECK: %[[ADDRESS:.*]] = fir.extract_value %[[CPTR_LOAD]], [0 : index] : (!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) -> i64 +! CHECK: %[[ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! CHECK: %[[ADDRESS_COORD:.*]] = fir.coordinate_of %[[CPTR_COORD]], %[[ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> +! CHECK: %[[ADDRESS_LOADED:.*]] = fir.load %[[ADDRESS_COORD]] : !fir.ref<i64> +! CHECK: %[[ADDRESS_IDX:.*]] = fir.convert %[[ADDRESS_LOADED]] : (i64) -> !fir.ptr<!fir.array<?xf32>> ! CHECK: %[[EMBOX:.*]] = fir.embox %[[ADDRESS_IDX]](%{{.*}}) : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>> ! CHECK: fir.store %[[EMBOX]] to %[[X]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> From 3dad29b677e427bf69c035605a16efd065576829 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 10 Sep 2024 19:36:04 -0700 Subject: [PATCH 053/114] [LTO] Remove unused includes (NFC) (#108110) clangd reports these as unused headers. My manual inspection agrees with the findings.
--- llvm/include/llvm/LTO/LTO.h | 1 - llvm/include/llvm/Transforms/IPO/FunctionImport.h | 2 -- llvm/lib/LTO/LTO.cpp | 1 - llvm/lib/Transforms/IPO/FunctionImport.cpp | 1 - 4 files changed, 5 deletions(-) diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index fc6e93606de12..214aa4e1c562d 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -19,7 +19,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/StringMap.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/LTO/Config.h" diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index 0a6cc5951b706..70739709a810a 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -17,9 +17,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Error.h" #include -#include #include -#include #include #include diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 5d9a5cbd18f15..a88124dacfaef 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -38,7 +38,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/SHA1.h" diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 1aac8e0713587..ff0d78178bd18 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -46,7 +46,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include #include -#include #include #include #include From 6bbf7f06d8e4e84bbda9027252b26a0d9ae10cde Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 10 Sep 2024 21:32:24 -0700 Subject: [PATCH 054/114] [WebAssembly] Add assembly support for final EH proposal (#107917) This adds the basic assembly generation support for the final EH proposal, which was newly adopted in Sep 2023 and advanced into Phase 4 in Jul 2024: https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md This adds support for the generation of the new `try_table` and `throw_ref` instructions in .s assembly format. This does NOT yet include - Block annotation comment generation for .s format - .o object file generation - .s assembly parsing - Type checking (AsmTypeCheck) - Disassembler - Fixing unwind mismatches in CFGStackify These will be added as follow-up PRs. --- The format for `TRY_TABLE`, both for `MachineInstr` and `MCInst`, is as follows: ``` TRY_TABLE type number_of_catches catch_clauses* ``` where `catch_clause` is ``` catch_opcode tag+ destination ``` `catch_opcode` should be one of 0/1/2/3, which denotes `CATCH`/`CATCH_REF`/`CATCH_ALL`/`CATCH_ALL_REF` respectively. (See `BinaryFormat/Wasm.h`) `tag` exists when the catch is one of `CATCH` or `CATCH_REF`. The MIR format is printed as just the list of raw operands. The (stack-based) assembly instructions support pretty-printing, including printing `catch` clauses by name, in InstPrinter. In addition to the new instructions `TRY_TABLE` and `THROW_REF`, this adds four pseudo instructions: `CATCH`, `CATCH_REF`, `CATCH_ALL`, and `CATCH_ALL_REF`.
These are pseudo instructions to simulate block return values of `catch`, `catch_ref`, `catch_all`, `catch_all_ref` clauses in `try_table` respectively, given that we don't support block return values except for one case (`fixEndsAtEndOfFunction` in CFGStackify). These will be omitted when we lower the instructions to `MCInst` at the end. LateEHPrepare now will have one more stage to convert `CATCH`/`CATCH_ALL`s to `CATCH_REF`/`CATCH_ALL_REF`s when there is a `RETHROW` to rethrow its exception. The pass also converts `RETHROW`s into `THROW_REF`. Note that we still use `RETHROW` as an interim pseudo instruction until we convert them to `THROW_REF` in LateEHPrepare. CFGStackify has a new `placeTryTableMarker` function, which places `try_table`/`end_try_table` markers with a necessary `catch` clause and also `block`/`end_block` markers for the destination of the `catch` clause. In MCInstLower, we now need to support one more case for the multivalue block signature (`catch_ref`'s destination's `(i32, exnref)` return type). InstPrinter has a new routine to print the `catch_list` type, which is used to print `try_table` instructions. The new test, `exception.ll`'s source is the same as `exception-legacy.ll`, with the FileCheck expectations changed. One difference is that the commands in this file have `-wasm-enable-exnref` to test the new format, and don't have `-wasm-disable-explicit-locals -wasm-keep-registers`, because the new custom InstPrinter routine to print `catch_list` only works for the stack-based instructions (`_S`), and we can't use `-wasm-keep-registers` for them. As in `exception-legacy.ll`, the FileCheck lines for the new tests do not contain the whole program; they mostly contain only the control flow instructions for readability. --- llvm/include/llvm/BinaryFormat/Wasm.h | 8 + .../AsmParser/WebAssemblyAsmParser.cpp | 10 +- .../MCTargetDesc/WebAssemblyInstPrinter.cpp | 41 ++ .../MCTargetDesc/WebAssemblyInstPrinter.h | 1 + .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 30 ++ .../MCTargetDesc/WebAssemblyMCTypeUtilities.h | 14 +- .../WebAssembly/WebAssemblyAsmPrinter.cpp | 11 + .../WebAssembly/WebAssemblyCFGStackify.cpp | 322 +++++++++++- .../WebAssembly/WebAssemblyISelDAGToDAG.cpp | 5 +- .../WebAssembly/WebAssemblyInstrControl.td | 36 +- .../WebAssembly/WebAssemblyLateEHPrepare.cpp | 76 ++- .../WebAssembly/WebAssemblyMCInstLower.cpp | 22 +- .../WebAssembly/WebAssemblyUtilities.cpp | 2 + .../CodeGen/WebAssembly/exception-legacy.ll | 2 +- llvm/test/CodeGen/WebAssembly/exception.ll | 470 ++++++++++++++++++ 15 files changed, 1009 insertions(+), 41 deletions(-) create mode 100644 llvm/test/CodeGen/WebAssembly/exception.ll diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h index acf89885af6fd..9b21d6d65c2a8 100644 --- a/llvm/include/llvm/BinaryFormat/Wasm.h +++ b/llvm/include/llvm/BinaryFormat/Wasm.h @@ -144,6 +144,14 @@ enum : unsigned { WASM_OPCODE_I32_RMW_CMPXCHG = 0x48, }; +// Sub-opcodes for catch clauses in a try_table instruction +enum : unsigned { + WASM_OPCODE_CATCH = 0x00, + WASM_OPCODE_CATCH_REF = 0x01, + WASM_OPCODE_CATCH_ALL = 0x02, + WASM_OPCODE_CATCH_ALL_REF = 0x03, +}; + enum : unsigned { WASM_LIMITS_FLAG_NONE = 0x0, WASM_LIMITS_FLAG_HAS_MAX = 0x1, diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 24a9ad67cfe04..5299e6ea06f0b 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ 
b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -45,7 +45,7 @@ namespace { /// WebAssemblyOperand - Instances of this class represent the operands in a /// parsed Wasm machine instruction. struct WebAssemblyOperand : public MCParsedAsmOperand { - enum KindTy { Token, Integer, Float, Symbol, BrList } Kind; + enum KindTy { Token, Integer, Float, Symbol, BrList, CatchList } Kind; SMLoc StartLoc, EndLoc; @@ -99,6 +99,7 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { bool isMem() const override { return false; } bool isReg() const override { return false; } bool isBrList() const { return Kind == BrList; } + bool isCatchList() const { return Kind == CatchList; } MCRegister getReg() const override { llvm_unreachable("Assembly inspects a register operand"); @@ -151,6 +152,10 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createImm(Br)); } + void addCatchListOperands(MCInst &Inst, unsigned N) const { + // TODO + } + void print(raw_ostream &OS) const override { switch (Kind) { case Token: @@ -168,6 +173,9 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { case BrList: OS << "BrList:" << BrL.List.size(); break; + case CatchList: + // TODO + break; } } }; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index b85ed1d93593b..903dbcd21ea96 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -367,3 +367,44 @@ void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, } } } + +void WebAssemblyInstPrinter::printCatchList(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned OpIdx = OpNo; + const MCOperand &Op = MI->getOperand(OpIdx++); + unsigned NumCatches = Op.getImm(); + + auto PrintTagOp = [&](const MCOperand &Op) { + const MCSymbolRefExpr *TagExpr = nullptr; + const MCSymbolWasm *TagSym = nullptr; + assert(Op.isExpr()); + TagExpr = dyn_cast<MCSymbolRefExpr>(Op.getExpr()); + TagSym = cast<MCSymbolWasm>(&TagExpr->getSymbol()); + O << TagSym->getName() << " "; + }; + + for (unsigned I = 0; I < NumCatches; I++) { + const MCOperand &Op = MI->getOperand(OpIdx++); + O << "("; + switch (Op.getImm()) { + case wasm::WASM_OPCODE_CATCH: + O << "catch "; + PrintTagOp(MI->getOperand(OpIdx++)); + break; + case wasm::WASM_OPCODE_CATCH_REF: + O << "catch_ref "; + PrintTagOp(MI->getOperand(OpIdx++)); + break; + case wasm::WASM_OPCODE_CATCH_ALL: + O << "catch_all "; + break; + case wasm::WASM_OPCODE_CATCH_ALL_REF: + O << "catch_all_ref "; + break; + } + O << MI->getOperand(OpIdx++).getImm(); // destination + O << ")"; + if (I < NumCatches - 1) + O << " "; + } +} diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h index 8fd54d1640905..b499926ab8296 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h @@ -47,6 +47,7 @@ class WebAssemblyInstPrinter final : public MCInstPrinter { raw_ostream &O); void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printCatchList(const MCInst *MI, unsigned OpNo, raw_ostream &O); // Autogenerated by tblgen.
std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 00f15e1db5e13..e3a60fa4812d8 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -87,6 +87,8 @@ enum OperandType { OPERAND_BRLIST, /// 32-bit unsigned table number. OPERAND_TABLE, + /// A list of catch clauses for try_table. + OPERAND_CATCH_LIST, }; } // end namespace WebAssembly @@ -119,6 +121,10 @@ enum TOF { // address relative the __table_base wasm global. // Only applicable to function symbols. MO_TABLE_BASE_REL, + + // On a block signature operand this indicates that this is a destination + // block of a (catch_ref) clause in try_table. + MO_CATCH_BLOCK_SIG, }; } // end namespace WebAssemblyII @@ -462,6 +468,22 @@ inline bool isMarker(unsigned Opc) { case WebAssembly::TRY_S: case WebAssembly::END_TRY: case WebAssembly::END_TRY_S: + case WebAssembly::TRY_TABLE: + case WebAssembly::TRY_TABLE_S: + case WebAssembly::END_TRY_TABLE: + case WebAssembly::END_TRY_TABLE_S: + return true; + default: + return false; + } +} + +inline bool isTry(unsigned Opc) { + switch (Opc) { + case WebAssembly::TRY: + case WebAssembly::TRY_S: + case WebAssembly::TRY_TABLE: + case WebAssembly::TRY_TABLE_S: return true; default: return false; } } @@ -474,6 +496,14 @@ inline bool isCatch(unsigned Opc) { switch (Opc) { case WebAssembly::CATCH_LEGACY: case WebAssembly::CATCH_LEGACY_S: case WebAssembly::CATCH_ALL_LEGACY: case WebAssembly::CATCH_ALL_LEGACY_S: + case WebAssembly::CATCH: + case WebAssembly::CATCH_S: + case WebAssembly::CATCH_REF: + case WebAssembly::CATCH_REF_S: + case WebAssembly::CATCH_ALL: + case WebAssembly::CATCH_ALL_S: + case WebAssembly::CATCH_ALL_REF: + case WebAssembly::CATCH_ALL_REF_S: return true; default: return false; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h index 063ee4dba9068..4aca092e0e4c4 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h @@ -33,11 +33,15 @@ enum class BlockType : unsigned { Externref = unsigned(wasm::ValType::EXTERNREF), Funcref = unsigned(wasm::ValType::FUNCREF), Exnref = unsigned(wasm::ValType::EXNREF), - // Multivalue blocks (and other non-void blocks) are only emitted when the - // blocks will never be exited and are at the ends of functions (see - // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made - // to pop values off the stack, so the exact multivalue signature can always - // be inferred from the return type of the parent function in MCInstLower. + // Multivalue blocks are emitted in two cases: + // 1. When the blocks will never be exited and are at the ends of functions + // (see WebAssemblyCFGStackify::fixEndsAtEndOfFunction). In this case the + // exact multivalue signature can always be inferred from the return type + // of the parent function. + // 2. (catch_ref ...) clause in try_table instruction. Currently all tags we + // support (cpp_exception and c_longjmp) throw a single i32, so the + // multivalue signature for this case will be (i32, exnref). + // The real multivalue signature will be added in MCInstLower.
   Multivalue = 0xffff,
 };
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 6dd6145ed0057..14c0eaac17daa 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -683,6 +683,17 @@ void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // This is a compiler barrier that prevents instruction reordering during
     // backend compilation, and should not be emitted.
     break;
+  case WebAssembly::CATCH:
+  case WebAssembly::CATCH_S:
+  case WebAssembly::CATCH_REF:
+  case WebAssembly::CATCH_REF_S:
+  case WebAssembly::CATCH_ALL:
+  case WebAssembly::CATCH_ALL_S:
+  case WebAssembly::CATCH_ALL_REF:
+  case WebAssembly::CATCH_ALL_REF_S:
+    // These are pseudo instructions that represent catch clauses in a
+    // try_table instruction to simulate block return values.
+    break;
   default: {
     WebAssemblyMCInstLower MCInstLowering(OutContext, *this);
     MCInst TmpInst;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 3cccc57e629fd..a5f73fabca354 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -9,9 +9,9 @@
 /// \file
 /// This file implements a CFG stacking pass.
 ///
-/// This pass inserts BLOCK, LOOP, and TRY markers to mark the start of scopes,
-/// since scope boundaries serve as the labels for WebAssembly's control
-/// transfers.
+/// This pass inserts BLOCK, LOOP, TRY, and TRY_TABLE markers to mark the start
+/// of scopes, since scope boundaries serve as the labels for WebAssembly's
+/// control transfers.
 ///
 /// This is sufficient to convert arbitrary CFGs into a form that works on
 /// WebAssembly, provided that all loops are single-entry.
@@ -21,6 +21,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "Utils/WebAssemblyTypeUtilities.h"
 #include "WebAssembly.h"
 #include "WebAssemblyExceptionInfo.h"
@@ -29,6 +30,7 @@
 #include "WebAssemblySubtarget.h"
 #include "WebAssemblyUtilities.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/BinaryFormat/Wasm.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -74,6 +76,7 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
   void placeBlockMarker(MachineBasicBlock &MBB);
   void placeLoopMarker(MachineBasicBlock &MBB);
   void placeTryMarker(MachineBasicBlock &MBB);
+  void placeTryTableMarker(MachineBasicBlock &MBB);
 
   // Exception handling related functions
   bool fixCallUnwindMismatches(MachineFunction &MF);
@@ -97,11 +100,11 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
   void fixEndsAtEndOfFunction(MachineFunction &MF);
   void cleanupFunctionData(MachineFunction &MF);
 
-  // For each BLOCK|LOOP|TRY, the corresponding END_(BLOCK|LOOP|TRY) or DELEGATE
-  // (in case of TRY).
+  // For each BLOCK|LOOP|TRY|TRY_TABLE, the corresponding
+  // END_(BLOCK|LOOP|TRY|TRY_TABLE) or DELEGATE (in case of TRY).
   DenseMap<const MachineInstr *, MachineInstr *> BeginToEnd;
-  // For each END_(BLOCK|LOOP|TRY) or DELEGATE, the corresponding
-  // BLOCK|LOOP|TRY.
+  // For each END_(BLOCK|LOOP|TRY|TRY_TABLE) or DELEGATE, the corresponding
+  // BLOCK|LOOP|TRY|TRY_TABLE.
   DenseMap<const MachineInstr *, MachineInstr *> EndToBegin;
   // <TRY marker, EH pad> map
   DenseMap<const MachineInstr *, MachineBasicBlock *> TryToEHPad;
@@ -150,9 +153,10 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
 } // end anonymous namespace
 
 char WebAssemblyCFGStackify::ID = 0;
-INITIALIZE_PASS(WebAssemblyCFGStackify, DEBUG_TYPE,
-                "Insert BLOCK/LOOP/TRY markers for WebAssembly scopes", false,
-                false)
+INITIALIZE_PASS(
+    WebAssemblyCFGStackify, DEBUG_TYPE,
+    "Insert BLOCK/LOOP/TRY/TRY_TABLE markers for WebAssembly scopes", false,
+    false)
 
 FunctionPass *llvm::createWebAssemblyCFGStackify() {
   return new WebAssemblyCFGStackify();
@@ -314,12 +318,13 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
 #endif
     }
 
-    // If there is a previously placed BLOCK/TRY marker and its corresponding
-    // END marker is before the current BLOCK's END marker, that should be
-    // placed after this BLOCK. Otherwise it should be placed before this BLOCK
-    // marker.
+    // If there is a previously placed BLOCK/TRY/TRY_TABLE marker and its
+    // corresponding END marker is before the current BLOCK's END marker, that
+    // should be placed after this BLOCK. Otherwise it should be placed before
+    // this BLOCK marker.
     if (MI.getOpcode() == WebAssembly::BLOCK ||
-        MI.getOpcode() == WebAssembly::TRY) {
+        MI.getOpcode() == WebAssembly::TRY ||
+        MI.getOpcode() == WebAssembly::TRY_TABLE) {
       if (BeginToEnd[&MI]->getParent()->getNumber() <= MBB.getNumber())
         AfterSet.insert(&MI);
 #ifndef NDEBUG
@@ -329,10 +334,11 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
     }
 
 #ifndef NDEBUG
-    // All END_(BLOCK|LOOP|TRY) markers should be before the BLOCK.
+    // All END_(BLOCK|LOOP|TRY|TRY_TABLE) markers should be before the BLOCK.
     if (MI.getOpcode() == WebAssembly::END_BLOCK ||
         MI.getOpcode() == WebAssembly::END_LOOP ||
-        MI.getOpcode() == WebAssembly::END_TRY)
+        MI.getOpcode() == WebAssembly::END_TRY ||
+        MI.getOpcode() == WebAssembly::END_TRY_TABLE)
       BeforeSet.insert(&MI);
 #endif
 
@@ -374,6 +380,11 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
     // loop is above this block's header, the END_LOOP should be placed after
     // the END_BLOCK, because the loop contains this block. Otherwise the
     // END_LOOP should be placed before the END_BLOCK. The same for END_TRY.
+    //
+    // Note that while there can be existing END_TRYs, there can't be
+    // END_TRY_TABLEs; END_TRYs are placed when their corresponding EH pad is
+    // processed, so they are placed below MBB (EH pad) in placeTryMarker, but
+    // an END_TRY_TABLE is placed like an END_BLOCK, so it can't be here already.
     if (MI.getOpcode() == WebAssembly::END_LOOP ||
         MI.getOpcode() == WebAssembly::END_TRY) {
       if (EndToBegin[&MI]->getParent()->getNumber() >= Header->getNumber())
@@ -657,7 +668,251 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
   updateScopeTops(Header, End);
 }
 
+void WebAssemblyCFGStackify::placeTryTableMarker(MachineBasicBlock &MBB) {
+  assert(MBB.isEHPad());
+  MachineFunction &MF = *MBB.getParent();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  const auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
+  SortRegionInfo SRI(MLI, WEI);
+  const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+
+  // Compute the nearest common dominator of all unwind predecessors
+  MachineBasicBlock *Header = nullptr;
+  int MBBNumber = MBB.getNumber();
+  for (auto *Pred : MBB.predecessors()) {
+    if (Pred->getNumber() < MBBNumber) {
+      Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred;
+      assert(!explicitlyBranchesTo(Pred, &MBB) &&
+             "Explicit branch to an EH pad!");
+    }
+  }
+  if (!Header)
+    return;
+
+  assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors");
+  MachineBasicBlock *LayoutPred = MBB.getPrevNode();
+
+  // If the nearest common dominator is inside a more deeply nested context,
+  // walk out to the nearest scope which isn't more deeply nested.
+  for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) {
+    if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) {
+      if (ScopeTop->getNumber() > Header->getNumber()) {
+        // Skip over an intervening scope.
+        I = std::next(ScopeTop->getIterator());
+      } else {
+        // We found a scope level at an appropriate depth.
+        Header = ScopeTop;
+        break;
+      }
+    }
+  }
+
+  // Decide where in Header to put the TRY_TABLE.
+
+  // Instructions that should go before the TRY_TABLE.
+  SmallPtrSet<const MachineInstr *, 4> BeforeSet;
+  // Instructions that should go after the TRY_TABLE.
+  SmallPtrSet<const MachineInstr *, 4> AfterSet;
+  for (const auto &MI : *Header) {
+    // If there is a previously placed LOOP marker and the bottom block of the
+    // loop is above MBB, it should be after the TRY_TABLE, because the loop is
+    // nested in this TRY_TABLE. Otherwise it should be before the TRY_TABLE.
+    if (MI.getOpcode() == WebAssembly::LOOP) {
+      auto *LoopBottom = BeginToEnd[&MI]->getParent()->getPrevNode();
+      if (MBB.getNumber() > LoopBottom->getNumber())
+        AfterSet.insert(&MI);
+#ifndef NDEBUG
+      else
+        BeforeSet.insert(&MI);
+#endif
+    }
+
+    // All previously inserted BLOCK/TRY_TABLE markers should be after the
+    // TRY_TABLE because they are all nested blocks/try_tables.
+    if (MI.getOpcode() == WebAssembly::BLOCK ||
+        MI.getOpcode() == WebAssembly::TRY_TABLE)
+      AfterSet.insert(&MI);
+
+#ifndef NDEBUG
+    // All END_(BLOCK/LOOP/TRY_TABLE) markers should be before the TRY_TABLE.
+    if (MI.getOpcode() == WebAssembly::END_BLOCK ||
+        MI.getOpcode() == WebAssembly::END_LOOP ||
+        MI.getOpcode() == WebAssembly::END_TRY_TABLE)
+      BeforeSet.insert(&MI);
+#endif
+
+    // Terminators should go after the TRY_TABLE.
+    if (MI.isTerminator())
+      AfterSet.insert(&MI);
+  }
+
+  // If Header unwinds to MBB (= Header contains 'invoke'), the try_table block
+  // should contain the call within it. So the call should go after the
+  // TRY_TABLE. The exception is when the header's terminator is a rethrow
+  // instruction, in which case that instruction, not a call instruction before
+  // it, is going to throw.
+  MachineInstr *ThrowingCall = nullptr;
+  if (MBB.isPredecessor(Header)) {
+    auto TermPos = Header->getFirstTerminator();
+    if (TermPos == Header->end() ||
+        TermPos->getOpcode() != WebAssembly::RETHROW) {
+      for (auto &MI : reverse(*Header)) {
+        if (MI.isCall()) {
+          AfterSet.insert(&MI);
+          ThrowingCall = &MI;
+          // Possibly throwing calls are usually wrapped by EH_LABEL
+          // instructions. We don't want to split them and the call.
+          if (MI.getIterator() != Header->begin() &&
+              std::prev(MI.getIterator())->isEHLabel()) {
+            AfterSet.insert(&*std::prev(MI.getIterator()));
+            ThrowingCall = &*std::prev(MI.getIterator());
+          }
+          break;
+        }
+      }
+    }
+  }
+
+  // Local expression tree should go after the TRY_TABLE.
+  // For BLOCK placement, we start the search from the previous instruction of
+  // a BB's terminator, but in TRY_TABLE's case, we should start from the
+  // previous instruction of a call that can throw, or an EH_LABEL that
+  // precedes the call, because the return values of the call's previous
+  // instructions can be stackified and consumed by the throwing call.
+  auto SearchStartPt = ThrowingCall ? MachineBasicBlock::iterator(ThrowingCall)
+                                    : Header->getFirstTerminator();
+  for (auto I = SearchStartPt, E = Header->begin(); I != E; --I) {
+    if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition())
+      continue;
+    if (WebAssembly::isChild(*std::prev(I), MFI))
+      AfterSet.insert(&*std::prev(I));
+    else
+      break;
+  }
+
+  // Add the TRY_TABLE and a BLOCK for the catch destination. We currently
+  // generate only one CATCH clause for a TRY_TABLE, so we need one BLOCK for
+  // its destination.
+  //
+  // Header:
+  //   block
+  //   try_table (catch ... $MBB)
+  //   ...
+  //
+  // MBB:
+  //   end_try_table
+  //   end_block   ;; destination of (catch ...)
+  //   ... catch handler body ...
+  auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet);
+  MachineInstrBuilder BlockMIB =
+      BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
+              TII.get(WebAssembly::BLOCK));
+  auto *Block = BlockMIB.getInstr();
+  MachineInstrBuilder TryTableMIB =
+      BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
+              TII.get(WebAssembly::TRY_TABLE))
+          .addImm(int64_t(WebAssembly::BlockType::Void))
+          .addImm(1); // # of catch clauses
+  auto *TryTable = TryTableMIB.getInstr();
+
+  // Add a CATCH_*** clause to the TRY_TABLE. These are pseudo instructions
+  // following the destination END_BLOCK to simulate block return values,
+  // because we currently don't support them.
+  auto *Catch = WebAssembly::findCatch(&MBB);
+  switch (Catch->getOpcode()) {
+  case WebAssembly::CATCH:
+    // CATCH's destination block's return type is the extracted value type,
+    // which is currently i32 for all supported tags.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::I32));
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH);
+    for (const auto &Use : Catch->uses()) {
+      // The only use operand a CATCH can have is the tag symbol.
+      TryTableMIB.addExternalSymbol(Use.getSymbolName());
+      break;
+    }
+    TryTableMIB.addMBB(&MBB);
+    break;
+  case WebAssembly::CATCH_REF:
+    // CATCH_REF's destination block's return type is the extracted value type
+    // followed by an exnref, which is (i32, exnref) in our case. We assign the
+    // actual multivalue signature in MCInstLower. MO_CATCH_BLOCK_SIG signals
+    // that this operand is used for catch_ref's multivalue destination.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::Multivalue));
+    Block->getOperand(0).setTargetFlags(WebAssemblyII::MO_CATCH_BLOCK_SIG);
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH_REF);
+    for (const auto &Use : Catch->uses()) {
+      TryTableMIB.addExternalSymbol(Use.getSymbolName());
+      break;
+    }
+    TryTableMIB.addMBB(&MBB);
+    break;
+  case WebAssembly::CATCH_ALL:
+    // CATCH_ALL's destination block's return type is void.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::Void));
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH_ALL);
+    TryTableMIB.addMBB(&MBB);
+    break;
+  case WebAssembly::CATCH_ALL_REF:
+    // CATCH_ALL_REF's destination block's return type is exnref.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::Exnref));
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH_ALL_REF);
+    TryTableMIB.addMBB(&MBB);
+    break;
+  }
+
+  // Decide where in MBB to put the END_TRY_TABLE, and the END_BLOCK for the
+  // CATCH destination.
+  BeforeSet.clear();
+  AfterSet.clear();
+  for (const auto &MI : MBB) {
+#ifndef NDEBUG
+    // END_TRY_TABLE should precede existing LOOP markers.
+    if (MI.getOpcode() == WebAssembly::LOOP)
+      AfterSet.insert(&MI);
+#endif
+
+    // If there is a previously placed END_LOOP marker and the header of the
+    // loop is above this try_table's header, the END_LOOP should be placed
+    // after the END_TRY_TABLE, because the loop contains this block. Otherwise
+    // the END_LOOP should be placed before the END_TRY_TABLE.
+    if (MI.getOpcode() == WebAssembly::END_LOOP) {
+      if (EndToBegin[&MI]->getParent()->getNumber() >= Header->getNumber())
+        BeforeSet.insert(&MI);
+#ifndef NDEBUG
+      else
+        AfterSet.insert(&MI);
+#endif
+    }
+
+#ifndef NDEBUG
+    // CATCH, CATCH_REF, CATCH_ALL, and CATCH_ALL_REF are pseudo-instructions
+    // that simulate the block return value, so they should be placed after the
+    // END_TRY_TABLE.
+    if (WebAssembly::isCatch(MI.getOpcode()))
+      AfterSet.insert(&MI);
+#endif
+  }
+
+  // Mark the end of the TRY_TABLE and the BLOCK.
+  InsertPos = getEarliestInsertPos(&MBB, BeforeSet, AfterSet);
+  MachineInstr *EndTryTable =
+      BuildMI(MBB, InsertPos, MBB.findPrevDebugLoc(InsertPos),
+              TII.get(WebAssembly::END_TRY_TABLE));
+  registerTryScope(TryTable, EndTryTable, &MBB);
+  MachineInstr *EndBlock =
+      BuildMI(MBB, InsertPos, MBB.findPrevDebugLoc(InsertPos),
+              TII.get(WebAssembly::END_BLOCK));
+  registerScope(Block, EndBlock);
+  // Track the farthest-spanning scope that ends at this point.
+  updateScopeTops(Header, &MBB);
+}
+
 void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
+  if (WebAssembly::WasmEnableExnref)
+    return;
+
   const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
 
   // When there is an unconditional branch right before a catch instruction and
@@ -1445,6 +1700,7 @@ void WebAssemblyCFGStackify::recalculateScopeTops(MachineFunction &MF) {
       case WebAssembly::END_BLOCK:
       case WebAssembly::END_LOOP:
      case WebAssembly::END_TRY:
+      case WebAssembly::END_TRY_TABLE:
       case WebAssembly::DELEGATE:
         updateScopeTops(EndToBegin[&MI]->getParent(), &MBB);
         break;
@@ -1502,6 +1758,7 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
       }
       case WebAssembly::END_BLOCK:
       case WebAssembly::END_LOOP:
+      case WebAssembly::END_TRY_TABLE:
       case WebAssembly::DELEGATE:
         EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType));
         continue;
@@ -1528,7 +1785,7 @@ static void appendEndToFunction(MachineFunction &MF,
                   TII.get(WebAssembly::END_FUNCTION));
 }
 
-/// Insert BLOCK/LOOP/TRY markers at appropriate places.
+/// Insert BLOCK/LOOP/TRY/TRY_TABLE markers at appropriate places.
 void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) {
   // We allocate one more than the number of blocks in the function to
   // accommodate for the possible fake block we may insert at the end.
@@ -1540,15 +1797,25 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) {
   const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo();
   for (auto &MBB : MF) {
     if (MBB.isEHPad()) {
-      // Place the TRY for MBB if MBB is the EH pad of an exception.
+      // Place the TRY/TRY_TABLE for MBB if MBB is the EH pad of an exception.
       if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
-          MF.getFunction().hasPersonalityFn())
-        placeTryMarker(MBB);
+          MF.getFunction().hasPersonalityFn()) {
+        if (WebAssembly::WasmEnableExnref)
+          placeTryTableMarker(MBB);
+        else
+          placeTryMarker(MBB);
+      }
     } else {
       // Place the BLOCK for MBB if MBB is branched to from above.
       placeBlockMarker(MBB);
     }
   }
+
+  // FIXME We return here temporarily until we implement fixing unwind
+  // mismatches for the new exnref proposal.
+  if (WebAssembly::WasmEnableExnref)
+    return;
+
   // Fix mismatches in unwind destinations induced by linearizing the code.
   if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
       MF.getFunction().hasPersonalityFn()) {
@@ -1668,11 +1935,14 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
   for (auto &MBB : reverse(MF)) {
     for (MachineInstr &MI : llvm::reverse(MBB)) {
       switch (MI.getOpcode()) {
+      case WebAssembly::TRY_TABLE:
+        RewriteOperands(MI);
+        [[fallthrough]];
       case WebAssembly::BLOCK:
       case WebAssembly::TRY:
         assert(ScopeTops[Stack.back().first->getNumber()]->getNumber() <=
                    MBB.getNumber() &&
-               "Block/try marker should be balanced");
+               "Block/try/try_table marker should be balanced");
         Stack.pop_back();
         break;
 
@@ -1687,6 +1957,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
         [[fallthrough]];
       }
       case WebAssembly::END_BLOCK:
+      case WebAssembly::END_TRY_TABLE:
         Stack.push_back(std::make_pair(&MBB, &MI));
         break;
 
@@ -1744,7 +2015,8 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
   // Liveness is not tracked for VALUE_STACK physreg.
   MF.getRegInfo().invalidateLiveness();
 
-  // Place the BLOCK/LOOP/TRY markers to indicate the beginnings of scopes.
+  // Place the BLOCK/LOOP/TRY/TRY_TABLE markers to indicate the beginnings of
+  // scopes.
   placeMarkers(MF);
 
   // Remove unnecessary instructions possibly introduced by try/end_trys.
@@ -1755,8 +2027,8 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
   // Convert MBB operands in terminators to relative depth immediates.
   rewriteDepthImmediates(MF);
 
-  // Fix up block/loop/try signatures at the end of the function to conform to
-  // WebAssembly's rules.
+  // Fix up block/loop/try/try_table signatures at the end of the function to
+  // conform to WebAssembly's rules.
   fixEndsAtEndOfFunction(MF);
 
   // Add an end instruction at the end of the function body.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 60c5e18fbb0cd..b5b9cbeacfa18 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -211,8 +211,11 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::wasm_catch: {
       int Tag = Node->getConstantOperandVal(2);
       SDValue SymNode = getTagSymNode(Tag, CurDAG);
+      unsigned CatchOpcode = WebAssembly::WasmEnableExnref
+                                 ? WebAssembly::CATCH
+                                 : WebAssembly::CATCH_LEGACY;
       MachineSDNode *Catch =
-          CurDAG->getMachineNode(WebAssembly::CATCH_LEGACY, DL,
+          CurDAG->getMachineNode(CatchOpcode, DL,
                                  {
                                      PtrVT,     // exception pointer
                                      MVT::Other // outchain type
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 05880b89d1fbc..97ff6d77f54b1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -125,15 +125,46 @@ defm DEBUG_UNREACHABLE : NRI<(outs), (ins), [(debugtrap)], "unreachable", 0x00>;
 // Exception handling instructions
 //===----------------------------------------------------------------------===//
 
+// A list of catch clauses attached to try_table.
+def CatchListAsmOperand : AsmOperandClass { let Name = "CatchList"; }
+let OperandNamespace = "WebAssembly", OperandType = "OPERAND_CATCH_LIST" in
+def catch_list : Operand<i32> {
+  let ParserMatchClass = CatchListAsmOperand;
+  let PrintMethod = "printCatchList";
+}
+
 let Predicates = [HasExceptionHandling] in {
 
-// Throwing an exception: throw
+// Throwing an exception: throw / throw_ref
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
 defm THROW : I<(outs), (ins tag_op:$tag, variable_ops),
                (outs), (ins tag_op:$tag), [],
               "throw \t$tag", "throw \t$tag", 0x08>;
+defm THROW_REF : I<(outs), (ins EXNREF:$exn), (outs), (ins), [],
+                   "throw_ref \t$exn", "throw_ref", 0x0a>;
 } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
 
+// Region within which an exception is caught: try_table / end_try_table
+let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
+defm TRY_TABLE : I<(outs), (ins Signature:$sig, variable_ops),
+                   (outs), (ins Signature:$sig, catch_list:$cal), [],
+                   "try_table \t$sig", "try_table \t$sig $cal", 0x1f>;
+defm END_TRY_TABLE : NRI<(outs), (ins), [], "end_try_table", 0x0b>;
+} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
+
+// Pseudo instructions that represent catch / catch_ref / catch_all /
+// catch_all_ref clauses in a try_table instruction.
+let hasCtrlDep = 1, hasSideEffects = 1, isCodeGenOnly = 1 in {
+let variadicOpsAreDefs = 1 in {
+defm CATCH : I<(outs), (ins tag_op:$tag, variable_ops),
+               (outs), (ins tag_op:$tag), []>;
+defm CATCH_REF : I<(outs), (ins tag_op:$tag, variable_ops),
+                   (outs), (ins tag_op:$tag), []>;
+}
+defm CATCH_ALL : NRI<(outs), (ins), []>;
+defm CATCH_ALL_REF : I<(outs EXNREF:$dst), (ins), (outs), (ins), []>;
+}
+
 // Pseudo instructions: cleanupret / catchret
 let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
     isPseudo = 1, isEHScopeReturn = 1 in {
@@ -147,9 +178,10 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
 //   usage gets low enough.
 
 // Rethrowing an exception: rethrow
+// The new exnref proposal also uses this instruction as an interim pseudo
+// instruction before we convert it to a THROW_REF.
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in
 defm RETHROW : NRI<(outs), (ins i32imm:$depth), [], "rethrow \t$depth", 0x09>;
-// isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
 
 // The depth argument will be computed in CFGStackify. We set it to 0 here for
 // now.
 def : Pat<(int_wasm_rethrow), (RETHROW 0)>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index f0c205cdb6aeb..70b406b6552bf 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -37,6 +37,7 @@ class WebAssemblyLateEHPrepare final : public MachineFunctionPass {
   void recordCatchRetBBs(MachineFunction &MF);
   bool hoistCatches(MachineFunction &MF);
   bool addCatchAlls(MachineFunction &MF);
+  bool addCatchRefsAndThrowRefs(MachineFunction &MF);
   bool replaceFuncletReturns(MachineFunction &MF);
   bool removeUnnecessaryUnreachables(MachineFunction &MF);
   bool restoreStackPointer(MachineFunction &MF);
@@ -127,6 +128,8 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
     Changed |= hoistCatches(MF);
     Changed |= addCatchAlls(MF);
     Changed |= replaceFuncletReturns(MF);
+    if (WebAssembly::WasmEnableExnref)
+      Changed |= addCatchRefsAndThrowRefs(MF);
   }
   Changed |= removeUnnecessaryUnreachables(MF);
   if (MF.getFunction().hasPersonalityFn())
@@ -214,9 +217,12 @@ bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) {
     if (InsertPos == MBB.end() ||
         !WebAssembly::isCatch(InsertPos->getOpcode())) {
       Changed = true;
+      unsigned CatchAllOpcode = WebAssembly::WasmEnableExnref
+                                    ? WebAssembly::CATCH_ALL
+                                    : WebAssembly::CATCH_ALL_LEGACY;
       BuildMI(MBB, InsertPos,
               InsertPos == MBB.end() ? DebugLoc() : InsertPos->getDebugLoc(),
-              TII.get(WebAssembly::CATCH_ALL_LEGACY));
+              TII.get(CatchAllOpcode));
     }
   }
   return Changed;
@@ -248,6 +254,10 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
     case WebAssembly::CLEANUPRET: {
       // Replace a cleanupret with a rethrow. For C++ support, currently
       // rethrow's immediate argument is always 0 (= the latest exception).
+      //
+      // Even when -wasm-enable-exnref is true, we use a RETHROW here for the
+      // moment. This will be converted to a THROW_REF in
+      // addCatchRefsAndThrowRefs.
       BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW))
           .addImm(0);
       TI->eraseFromParent();
@@ -259,14 +269,74 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
   return Changed;
 }
 
-// Remove unnecessary unreachables after a throw or rethrow.
+// Add CATCH_REF and CATCH_ALL_REF pseudo instructions to EH pads, and convert
+// RETHROWs to THROW_REFs.
+bool WebAssemblyLateEHPrepare::addCatchRefsAndThrowRefs(MachineFunction &MF) {
+  bool Changed = false;
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  auto &MRI = MF.getRegInfo();
+  DenseMap<MachineBasicBlock *, SmallVector<MachineInstr *, 2>>
+      EHPadToRethrows;
+
+  // Create a map of <EH pad, a list of RETHROWs>
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (MI.getOpcode() == WebAssembly::RETHROW) {
+        Changed = true;
+        auto *EHPad = getMatchingEHPad(&MI);
+        EHPadToRethrows[EHPad].push_back(&MI);
+      }
+    }
+  }
+
+  // Convert CATCH into CATCH_REF and CATCH_ALL into CATCH_ALL_REF, when the
+  // caught exception is rethrown. And convert RETHROWs to THROW_REFs.
+  for (auto &[EHPad, Rethrows] : EHPadToRethrows) {
+    auto *Catch = WebAssembly::findCatch(EHPad);
+    auto *InsertPos = Catch->getIterator()->getNextNode();
+    auto ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
+    if (Catch->getOpcode() == WebAssembly::CATCH) {
+      MachineInstrBuilder MIB = BuildMI(*EHPad, InsertPos, Catch->getDebugLoc(),
+                                        TII.get(WebAssembly::CATCH_REF));
+      // Copy defs (= extracted values) from the old CATCH to the new CATCH_REF
+      for (const auto &Def : Catch->defs())
+        MIB.addDef(Def.getReg());
+      MIB.addDef(ExnReg); // Attach the exnref def after extracted values
+      // Copy the tag symbol (The only use operand a CATCH can have is the tag
+      // symbol)
+      for (const auto &Use : Catch->uses()) {
+        MIB.addExternalSymbol(Use.getSymbolName());
+        break;
+      }
+    } else if (Catch->getOpcode() == WebAssembly::CATCH_ALL) {
+      BuildMI(*EHPad, InsertPos, Catch->getDebugLoc(),
+              TII.get(WebAssembly::CATCH_ALL_REF))
+          .addDef(ExnReg);
+    } else {
+      assert(false);
+    }
+    Catch->eraseFromParent();
+
+    for (auto *Rethrow : Rethrows) {
+      auto InsertPos = std::next(Rethrow->getIterator());
+      BuildMI(*Rethrow->getParent(), InsertPos, Rethrow->getDebugLoc(),
+              TII.get(WebAssembly::THROW_REF))
+          .addReg(ExnReg);
+      Rethrow->eraseFromParent();
+    }
+  }
+
+  return Changed;
+}
+
+// Remove unnecessary unreachables after a throw/rethrow/throw_ref.
 bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables(
     MachineFunction &MF) {
   bool Changed = false;
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
       if (MI.getOpcode() != WebAssembly::THROW &&
-          MI.getOpcode() != WebAssembly::RETHROW)
+          MI.getOpcode() != WebAssembly::RETHROW &&
+          MI.getOpcode() != WebAssembly::THROW_REF)
         continue;
       Changed = true;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 431dc7f33ac89..73ff50f39b020 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "WebAssemblyMCInstLower.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "TargetInfo/WebAssemblyTargetInfo.h"
 #include "Utils/WebAssemblyTypeUtilities.h"
 #include "WebAssemblyAsmPrinter.h"
@@ -220,12 +221,27 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
       MCOp = lowerTypeIndexOperand(std::move(Returns), std::move(Params));
       break;
-    } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
+    }
+    if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
       auto BT = static_cast<WebAssembly::BlockType>(MO.getImm());
       assert(BT != WebAssembly::BlockType::Invalid);
       if (BT == WebAssembly::BlockType::Multivalue) {
-        SmallVector<wasm::ValType, 1> Returns;
-        getFunctionReturns(MI, Returns);
+        SmallVector<wasm::ValType, 2> Returns;
+        // Multivalue blocks are emitted in two cases:
+        // 1. When the blocks will never be exited and are at the ends of
+        //    functions (see
+        //    WebAssemblyCFGStackify::fixEndsAtEndOfFunction). In this case
+        //    the exact multivalue signature can always be inferred from the
+        //    return type of the parent function.
+        // 2. (catch_ref ...) clause in try_table instruction. Currently all
+        //    tags we support (cpp_exception and c_longjmp) throw a single
+        //    i32, so the multivalue signature for this case will be (i32,
+        //    exnref). Having MO_CATCH_BLOCK_SIG target flags means this is
+        //    a destination of a catch_ref.
+        if (MO.getTargetFlags() == WebAssemblyII::MO_CATCH_BLOCK_SIG)
+          Returns = {wasm::ValType::I32, wasm::ValType::EXNREF};
+        else
+          getFunctionReturns(MI, Returns);
         MCOp = lowerTypeIndexOperand(std::move(Returns),
                                      SmallVector<wasm::ValType>());
         break;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index c5a047ee47d73..ed186e65a80cf 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -43,6 +43,8 @@ bool WebAssembly::mayThrow(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   case WebAssembly::THROW:
   case WebAssembly::THROW_S:
+  case WebAssembly::THROW_REF:
+  case WebAssembly::THROW_REF_S:
   case WebAssembly::RETHROW:
   case WebAssembly::RETHROW_S:
     return true;
diff --git a/llvm/test/CodeGen/WebAssembly/exception-legacy.ll b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll
index aa191209516f6..3327d8be894f0 100644
--- a/llvm/test/CodeGen/WebAssembly/exception-legacy.ll
+++ b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll
@@ -109,7 +109,7 @@ ehcleanup:                                        ; preds = %entry
 }
 
 ; Calling a function that may throw within a 'catch (...)' generates a
-; temrinatepad, because __cxa_end_catch() also can throw within 'catch (...)'.
+; terminatepad, because __cxa_end_catch() also can throw within 'catch (...)'.
 ;
 ; void foo();
 ; void terminatepad() {
diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception.ll
new file mode 100644
index 0000000000000..7259761d6313b
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/exception.ll
@@ -0,0 +1,470 @@
+; RUN: llc < %s -asm-verbose=false -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-enable-exnref -verify-machineinstrs | FileCheck --implicit-check-not=ehgcr -allow-deprecated-dag-overlap %s
+; RUN: llc < %s -asm-verbose=false -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-enable-exnref -verify-machineinstrs -O0
+; RUN: llc < %s -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-enable-exnref
+
+target triple = "wasm32-unknown-unknown"
+
+%struct.Temp = type { i8 }
+
+@_ZTIi = external dso_local constant ptr
+
+; CHECK: .tagtype __cpp_exception i32
+
+; CHECK-LABEL: throw:
+; CHECK: throw __cpp_exception
+; CHECK-NOT: unreachable
+define void @throw(ptr %p) {
+  call void @llvm.wasm.throw(i32 0, ptr %p)
+  ret void
+}
+
+; Simple test with a try-catch
+;
+; void foo();
+; void catch() {
+;   try {
+;     foo();
+;   } catch (int) {
+;   }
+; }
+
+; CHECK-LABEL: catch:
+; CHECK: global.get __stack_pointer
+; CHECK: local.set 0
+; CHECK: block
+; CHECK: block () -> (i32, exnref)
+; CHECK: try_table (catch_ref __cpp_exception 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: local.set 2
+; CHECK: local.get 0
+; CHECK: global.set __stack_pointer
+; CHECK: i32.store __wasm_lpad_context
+; CHECK: call _Unwind_CallPersonality
+; CHECK: block
+; CHECK: br_if 0
+; CHECK: call __cxa_begin_catch
+; CHECK: call __cxa_end_catch
+; CHECK: br 1
+; CHECK: end_block
+; CHECK: local.get 2
+; CHECK: throw_ref
+; CHECK: end_block
+define void @catch() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr @_ZTIi]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch, label %rethrow
+
+catch:                                            ; preds = %catch.start
+  %5 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ]
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+rethrow:                                          ; preds = %catch.start
+  call void @llvm.wasm.rethrow() [ "funclet"(token %1) ]
+  unreachable
+
+try.cont:                                         ; preds = %catch, %entry
+  ret void
+}
+
+; Destructor (cleanup) test
+;
+; void foo();
+; struct Temp {
+;   ~Temp() {}
+; };
+; void cleanup() {
+;   Temp t;
+;   foo();
+; }
+
+; CHECK-LABEL: cleanup:
+; CHECK: block
+; CHECK: block exnref
+; CHECK: try_table (catch_all_ref 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: local.set 1
+; CHECK: global.set __stack_pointer
+; CHECK: call _ZN4TempD2Ev
+; CHECK: local.get 1
+; CHECK: throw_ref
+; CHECK: end_block
+; CHECK: call _ZN4TempD2Ev
+define void @cleanup() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  %t = alloca %struct.Temp, align 1
+  invoke void @foo()
+          to label %invoke.cont unwind label %ehcleanup
+
+invoke.cont:                                      ; preds = %entry
+  %call = call ptr @_ZN4TempD2Ev(ptr %t)
+  ret void
+
+ehcleanup:                                        ; preds = %entry
+  %0 = cleanuppad within none []
+  %call1 = call ptr @_ZN4TempD2Ev(ptr %t) [ "funclet"(token %0) ]
+  cleanupret from %0 unwind to caller
+}
+
+; Calling a function that may throw within a 'catch (...)' generates a
+; terminatepad, because __cxa_end_catch() also can throw within 'catch (...)'.
+;
+; void foo();
+; void terminatepad() {
+;   try {
+;     foo();
+;   } catch (...) {
+;     foo();
+;   }
+; }
+
+; CHECK-LABEL: terminatepad
+; CHECK: block
+; CHECK: block i32
+; CHECK: try_table (catch __cpp_exception 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: call __cxa_begin_catch
+; CHECK: block
+; CHECK: block exnref
+; CHECK: try_table (catch_all_ref 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: local.set 2
+; CHECK: block
+; CHECK: block
+; CHECK: try_table (catch_all 0)
+; CHECK: call __cxa_end_catch
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: call _ZSt9terminatev
+; CHECK: unreachable
+; CHECK: end_block
+; CHECK: local.get 2
+; CHECK: throw_ref
+; CHECK: end_block
+; CHECK: call __cxa_end_catch
+; CHECK: end_block
+define void @terminatepad() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr null]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ]
+  invoke void @foo() [ "funclet"(token %1) ]
+          to label %invoke.cont1 unwind label %ehcleanup
+
+invoke.cont1:                                     ; preds = %catch.start
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont1, %entry
+  ret void
+
+ehcleanup:                                        ; preds = %catch.start
+  %5 = cleanuppad within %1 []
+  invoke void @__cxa_end_catch() [ "funclet"(token %5) ]
+          to label %invoke.cont2 unwind label %terminate
+
+invoke.cont2:                                     ; preds = %ehcleanup
+  cleanupret from %5 unwind to caller
+
+terminate:                                        ; preds = %ehcleanup
+  %6 = cleanuppad within %5 []
+  call void @_ZSt9terminatev() [ "funclet"(token %6) ]
+  unreachable
+}
+
+; Tests prologues and epilogues are not generated within EH scopes.
+; They should not be treated as funclets; BBs starting with a catch instruction
+; should not have a prologue, and BBs ending with a catchret/cleanupret should
+; not have an epilogue. This is separate from __stack_pointer restoring
+; instructions after a catch instruction.
+;
+; void bar(int) noexcept;
+; void no_prolog_epilog_in_ehpad() {
+;   int stack_var = 0;
+;   bar(stack_var);
+;   try {
+;     foo();
+;   } catch (int) {
+;     foo();
+;   }
+; }
+
+; CHECK-LABEL: no_prolog_epilog_in_ehpad
+; CHECK: call bar
+; CHECK: block
+; CHECK: block () -> (i32, exnref)
+; CHECK: try_table (catch_ref __cpp_exception 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: local.set 2
+; CHECK-NOT: global.get __stack_pointer
+; CHECK: global.set __stack_pointer
+; CHECK: block
+; CHECK: block
+; CHECK: br_if 0
+; CHECK: call __cxa_begin_catch
+; CHECK: block exnref
+; CHECK: try_table (catch_all_ref 0)
+; CHECK: call foo
+; CHECK: br 3
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: local.set 2
+; CHECK-NOT: global.get __stack_pointer
+; CHECK: global.set __stack_pointer
+; CHECK: call __cxa_end_catch
+; CHECK: local.get 2
+; CHECK: throw_ref
+; CHECK-NOT: global.set __stack_pointer
+; CHECK: end_block
+; CHECK: local.get 2
+; CHECK: throw_ref
+; CHECK: end_block
+; CHECK-NOT: global.set __stack_pointer
+; CHECK: call __cxa_end_catch
+; CHECK: end_block
+define void @no_prolog_epilog_in_ehpad() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  %stack_var = alloca i32, align 4
+  call void @bar(ptr %stack_var)
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr @_ZTIi]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch, label %rethrow
+
+catch:                                            ; preds = %catch.start
+  %5 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ]
+  %6 = load float, ptr %5, align 4
+  invoke void @foo() [ "funclet"(token %1) ]
+          to label %invoke.cont1 unwind label %ehcleanup
+
+invoke.cont1:                                     ; preds = %catch
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+rethrow:                                          ; preds = %catch.start
+  call void @llvm.wasm.rethrow() [ "funclet"(token %1) ]
+  unreachable
+
+try.cont:                                         ; preds = %invoke.cont1, %entry
+  ret void
+
+ehcleanup:                                        ; preds = %catch
+  %7 = cleanuppad within %1 []
+  call void @__cxa_end_catch() [ "funclet"(token %7) ]
+  cleanupret from %7 unwind to caller
+}
+
+; When a function does not have stack-allocated objects, it does not need to
+; store SP back to __stack_pointer global at the epilog.
+;
+; void foo();
+; void no_sp_writeback() {
+;   try {
+;     foo();
+;   } catch (...)
{
+;   }
+; }
+
+; CHECK-LABEL: no_sp_writeback
+; CHECK: block
+; CHECK: block i32
+; CHECK: try_table (catch __cpp_exception 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: call __cxa_begin_catch
+; CHECK: call __cxa_end_catch
+; CHECK: end_block
+; CHECK-NOT: global.set __stack_pointer
+; CHECK: end_function
+define void @no_sp_writeback() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr null]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ]
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %catch.start, %entry
+  ret void
+}
+
+; When the result of @llvm.wasm.get.exception is not used. This is created to
+; fix a bug in LateEHPrepare and this should not crash.
+define void @get_exception_wo_use() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr null]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %catch.start, %entry
+  ret void
+}
+
+; Tests a case when a cleanup region (cleanuppad ~ cleanupret) contains another
+; catchpad
+define void @complex_cleanup_region() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %invoke.cont unwind label %ehcleanup
+
+invoke.cont:                                      ; preds = %entry
+  ret void
+
+ehcleanup:                                        ; preds = %entry
+  %0 = cleanuppad within none []
+  invoke void @foo() [ "funclet"(token %0) ]
+          to label %ehcleanupret unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %ehcleanup
+  %1 = catchswitch within %0 [label %catch.start] unwind label %ehcleanup.1
+
+catch.start:                                      ; preds = %catch.dispatch
+  %2 = catchpad within %1 [ptr null]
+  %3 = call ptr @llvm.wasm.get.exception(token %2)
+  %4 = call i32 @llvm.wasm.get.ehselector(token %2)
+  catchret from %2 to label %ehcleanupret
+
+ehcleanup.1:                                      ; preds = %catch.dispatch
+  %5 = cleanuppad within %0 []
+  unreachable
+
+ehcleanupret:                                     ; preds = %catch.start, %ehcleanup
+  cleanupret from %0 unwind to caller
+}
+
+; Regression test for the bug that 'rethrow' was not treated correctly as a
+; terminator in isel.
+define void @rethrow_terminator() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind label %ehcleanup
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr @_ZTIi]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch, label %rethrow
+
+catch:                                            ; preds = %catch.start
+  %5 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ]
+  %6 = load i32, ptr %5, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+rethrow:                                          ; preds = %catch.start
+  invoke void @llvm.wasm.rethrow() #1 [ "funclet"(token %1) ]
+          to label %unreachable unwind label %ehcleanup
+
+try.cont:                                         ; preds = %entry, %catch
+  ret void
+
+ehcleanup:                                        ; preds = %rethrow, %catch.dispatch
+  ; 'rethrow' BB is this BB's predecessor, and its
+  ; 'invoke void @llvm.wasm.rethrow()' is lowered down to a 'RETHROW' in Wasm
+  ; MIR. And this 'phi' creates 'CONST_I32' instruction in the predecessor
+  ; 'rethrow' BB. If 'RETHROW' is not treated correctly as a terminator, it can
+  ; create a BB like
+  ; bb.3.rethrow:
+  ;   RETHROW 0
+  ;   %0 = CONST_I32 20
+  ;   BR ...
+  %tmp = phi i32 [ 10, %catch.dispatch ], [ 20, %rethrow ]
+  %7 = cleanuppad within none []
+  call void @take_i32(i32 %tmp) [ "funclet"(token %7) ]
+  cleanupret from %7 unwind to caller
+
+unreachable:                                      ; preds = %rethrow
+  unreachable
+}
+
+
+declare void @foo()
+declare void @bar(ptr)
+declare void @take_i32(i32)
+declare i32 @__gxx_wasm_personality_v0(...)
+; Function Attrs: noreturn
+declare void @llvm.wasm.throw(i32, ptr) #1
+; Function Attrs: nounwind
+declare ptr @llvm.wasm.get.exception(token) #0
+; Function Attrs: nounwind
+declare i32 @llvm.wasm.get.ehselector(token) #0
+; Function Attrs: noreturn
+declare void @llvm.wasm.rethrow() #1
+; Function Attrs: nounwind
+declare i32 @llvm.eh.typeid.for(ptr) #0
+declare ptr @__cxa_begin_catch(ptr)
+declare void @__cxa_end_catch()
+declare void @_ZSt9terminatev()
+declare ptr @_ZN4TempD2Ev(ptr returned)
+
+attributes #0 = { nounwind }
+attributes #1 = { noreturn }
+
+; CHECK: __cpp_exception:

From d03822d8887adc9312e65abf8d8ce1a16007f2a0 Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Wed, 11 Sep 2024 07:21:49 +0200
Subject: [PATCH 055/114] [clang][bytecode] Fix lookup of source locations in
 implicit ctors (#107992)

Implicit functions may still have a body. The !hasBody() check is
enough.
---
 clang/lib/AST/ByteCode/InterpFrame.cpp        | 19 ++++++++++++++-----
 clang/test/AST/ByteCode/builtin-functions.cpp |  7 +++++++
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp
index 6830a7b37f1da..28e189bb339e6 100644
--- a/clang/lib/AST/ByteCode/InterpFrame.cpp
+++ b/clang/lib/AST/ByteCode/InterpFrame.cpp
@@ -207,31 +207,40 @@ Pointer InterpFrame::getParamPointer(unsigned Off) {
   return Pointer(B);
 }
 
+static bool funcHasUsableBody(const Function *F) {
+  assert(F);
+
+  if (F->isConstructor() || F->isDestructor())
+    return true;
+
+  return !F->getDecl()->isImplicit();
+}
+
 SourceInfo InterpFrame::getSource(CodePtr PC) const {
   // Implicitly created functions don't have any code we could point at,
   // so return the call site.
-  if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller)
+  if (Func && !funcHasUsableBody(Func) && Caller)
     return Caller->getSource(RetPC);
 
   return S.getSource(Func, PC);
 }
 
 const Expr *InterpFrame::getExpr(CodePtr PC) const {
-  if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller)
-    return Caller->getExpr(RetPC);
+  if (Func && !funcHasUsableBody(Func) && Caller)
+    return Caller->getExpr(PC);
 
   return S.getExpr(Func, PC);
 }
 
 SourceLocation InterpFrame::getLocation(CodePtr PC) const {
-  if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller)
+  if (Func && !funcHasUsableBody(Func) && Caller)
     return Caller->getLocation(RetPC);
 
   return S.getLocation(Func, PC);
 }
 
 SourceRange InterpFrame::getRange(CodePtr PC) const {
-  if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller)
+  if (Func && !funcHasUsableBody(Func) && Caller)
     return Caller->getRange(RetPC);
 
   return S.getRange(Func, PC);
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 9c9ca23e0a6a6..9fd5eae67a21f 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -968,3 +968,10 @@ namespace FunctionStart {
   static_assert(__builtin_function_start(a) == a, ""); // both-error {{not an integral constant expression}} \
                                                        // both-note {{comparison of addresses of literals has unspecified value}}
 }
+
+namespace BuiltinInImplicitCtor {
+  constexpr struct {
+    int a = __builtin_isnan(1.0);
+  } Foo;
+  static_assert(Foo.a == 0, "");
+}

From 323911de277087b4898a96760c065a28f5d1bfa7 Mon Sep 17 00:00:00 2001
From: ChiaHungDuan
Date: Tue, 10 Sep 2024 22:24:06 -0700
Subject: [PATCH 056/114] Reapply "[scudo] Fix the logic of
 MaxAllowedFragmentedPages" (#108130) (#108134)

This reverts commit 76151c449080b7239c8b442291514a4300d51cba.

Also changed to check MaxAllowedFragmentedPages.
---
 compiler-rt/lib/scudo/standalone/secondary.h | 25 +++++++++++++-------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index 1a232b9b9fb2d..2fae29e5a2168 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -72,13 +72,16 @@ namespace {
 struct CachedBlock {
   static constexpr u16 CacheIndexMax = UINT16_MAX;
   static constexpr u16 InvalidEntry = CacheIndexMax;
-  // * MaxReleasedCachePages default is currently 4
-  //   - We arrived at this value after noticing that mapping
-  //   in larger memory regions performs better than releasing
-  //   memory and forcing a cache hit. According to the data,
-  //   it suggests that beyond 4 pages, the release execution time is
-  //   longer than the map execution time. In this way, the default
-  //   is dependent on the platform.
+  // We allow a certain amount of fragmentation, and part of the fragmented
+  // bytes will be released by `releaseAndZeroPagesToOS()`. This increases the
+  // chance of a cache hit and reduces RSS overhead at the same time. See
+  // more details in the `MapAllocatorCache::retrieve()` section.
+  //
+  // We arrived at this default value after noticing that mapping in larger
+  // memory regions performs better than releasing memory and forcing a cache
+  // hit. The data suggests that beyond 4 pages, the release execution time is
+  // longer than the map execution time. In this way, the default is dependent
+  // on the platform.
   static constexpr uptr MaxReleasedCachePages = 4U;
 
   uptr CommitBase = 0;
@@ -725,8 +728,14 @@ MapAllocator<Config>::tryAllocateFromCache(const Options &Options, uptr Size,
   uptr EntryHeaderPos;
   uptr MaxAllowedFragmentedPages = MaxUnreleasedCachePages;
 
-  if (UNLIKELY(useMemoryTagging<Config>(Options)))
+  if (LIKELY(!useMemoryTagging<Config>(Options))) {
     MaxAllowedFragmentedPages += CachedBlock::MaxReleasedCachePages;
+  } else {
+    // TODO: Enabling MaxReleasedCachePages may result in pages for an entry
+    // being partially released, and it erases the tag of those pages as well.
+    // To support this feature for MTE, we need to tag those pages again.
+    DCHECK_EQ(MaxAllowedFragmentedPages, MaxUnreleasedCachePages);
+  }
 
   Entry = Cache.retrieve(MaxAllowedFragmentedPages, Size, Alignment,
                          getHeadersSize(), EntryHeaderPos);

From 203a2ca8cd6af505e11a38aebceeaf864271042c Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa
Date: Tue, 10 Sep 2024 22:25:03 -0700
Subject: [PATCH 057/114] [webkit.RefCntblBaseVirtualDtor] Make
 ThreadSafeRefCounted not generate warnings (#107676)

This PR makes WebKit's RefCntblBaseVirtualDtor checker not generate a
warning for ThreadSafeRefCounted when the destruction thread is a
specific thread.

Prior to this PR, we only allowed CRTP classes without a virtual
destructor if their deref function had an explicit cast to the derived
type, skipping any lambda declarations which aren't invoked. This ends
up generating a warning for ThreadSafeRefCounted when a specific thread
is used to destroy the object: there is no inline body / definition for
ensureOnMainThread and ensureOnMainRunLoop, so DerefFuncDeleteExprVisitor
concludes that there is no explicit delete of the derived type.

This PR relaxes the condition DerefFuncDeleteExprVisitor checks by
allowing a delete expression to appear within a lambda declaration if
it's an argument to an "opaque" function, i.e. a function without a
definition / body.
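To illustrate, here is a reduced sketch of the pattern this change is
meant to accept (adapted from the test added below; this is an
illustrative simplification, not the full test):

    // ensureOnMainThread is declared but has no visible definition, so
    // the checker treats it as "opaque" and now inspects the lambda
    // argument for the explicit delete of the derived type.
    template <class T, DestructionThread destructionThread>
    class ThreadSafeRefCounted : public ThreadSafeRefCountedBase {
    public:
        void deref() const
        {
            if (!derefBase())
                return;
            if constexpr (destructionThread == DestructionThread::Main) {
                ensureOnMainThread([this] {
                    delete static_cast<const T*>(this); // found by the relaxed visitor
                });
            }
        }
    };

Note that a lambda that is merely declared but never passed to such an
opaque function (see BadThreadSafeRefCounted in the test) still does not
count as a delete of the derived type.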
---
 .../WebKit/RefCntblBaseVirtualDtorChecker.cpp |  26 ++
 .../ref-cntbl-crtp-base-no-virtual-dtor.cpp   | 232 ++++++++++++++++++
 2 files changed, 258 insertions(+)
 create mode 100644 clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
index 9df108e28ecdb..ecba5f9aa23ee 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
@@ -67,6 +67,32 @@ class DerefFuncDeleteExprVisitor
     const Decl *D = CE->getCalleeDecl();
     if (D && D->hasBody())
       return VisitBody(D->getBody());
+    else {
+      auto name = safeGetName(D);
+      if (name == "ensureOnMainThread" || name == "ensureOnMainRunLoop") {
+        for (unsigned i = 0; i < CE->getNumArgs(); ++i) {
+          auto *Arg = CE->getArg(i);
+          if (VisitLambdaArgument(Arg))
+            return true;
+        }
+      }
+    }
     return false;
   }
 
+  bool VisitLambdaArgument(const Expr *E) {
+    E = E->IgnoreParenCasts();
+    if (auto *TempE = dyn_cast<MaterializeTemporaryExpr>(E))
+      E = TempE->getSubExpr();
+    if (auto *ConstructE = dyn_cast<CXXConstructExpr>(E)) {
+      for (unsigned i = 0; i < ConstructE->getNumArgs(); ++i) {
+        auto *Arg = ConstructE->getArg(i);
+        if (auto *Lambda = dyn_cast<LambdaExpr>(Arg)) {
+          if (VisitBody(Lambda->getBody()))
+            return true;
+        }
+      }
+    }
+    return false;
+  }
+
diff --git a/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp
new file mode 100644
index 0000000000000..01527addb5299
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp
@@ -0,0 +1,232 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.RefCntblBaseVirtualDtor -verify %s
+
+#include "mock-types.h"
+
+namespace Detail {
+
+template <typename Out, typename... In>
+class CallableWrapperBase {
+public:
+    virtual ~CallableWrapperBase() { }
+    virtual Out call(In...) = 0;
+};
+
+template <typename, typename, typename...> class CallableWrapper;
+
+template <typename CallableType, typename Out, typename... In>
+class CallableWrapper : public CallableWrapperBase<Out, In...> {
+public:
+    explicit CallableWrapper(CallableType&& callable)
+        : m_callable(WTFMove(callable)) { }
+    CallableWrapper(const CallableWrapper&) = delete;
+    CallableWrapper& operator=(const CallableWrapper&) = delete;
+    Out call(In... in) final;
+private:
+    CallableType m_callable;
+};
+
+} // namespace Detail
+
+template <typename> class Function;
+
+template <typename Out, typename... In>
+Function<Out(In...)> adopt(Detail::CallableWrapperBase<Out, In...>*);
+
+template <typename Out, typename... In>
+class Function<Out(In...)> {
+public:
+    using Impl = Detail::CallableWrapperBase<Out, In...>;
+
+    Function() = default;
+
+    template <typename FunctionType>
+    Function(FunctionType f);
+
+    Out operator()(In... in) const;
+    explicit operator bool() const { return !!m_callableWrapper; }
+
+private:
+    enum AdoptTag { Adopt };
+    Function(Impl* impl, AdoptTag)
+        : m_callableWrapper(impl)
+    {
+    }
+
+    friend Function adopt<Out, In...>(Impl*);
+
+    Impl* m_callableWrapper;
+};
+
+template <typename Out, typename... In>
+Function<Out(In...)> adopt(Detail::CallableWrapperBase<Out, In...>* impl)
+{
+    return Function<Out(In...)>(impl, Function<Out(In...)>::Adopt);
+}
+
+template <typename T, typename PtrTraits = RawPtrTraits<T>, typename RefDerefTraits = DefaultRefDerefTraits<T>> Ref<T, PtrTraits, RefDerefTraits> adoptRef(T&);
+
+template <typename T, typename PtrTraits, typename RefDerefTraits>
+inline Ref<T, PtrTraits, RefDerefTraits> adoptRef(T& reference)
+{
+    return Ref<T, PtrTraits, RefDerefTraits>(reference);
+}
+
+enum class DestructionThread : unsigned char { Any, Main, MainRunLoop };
+void ensureOnMainThread(Function<void()>&&); // Sync if called on main thread, async otherwise.
+void ensureOnMainRunLoop(Function<void()>&&); // Sync if called on main run loop, async otherwise.
+
+class ThreadSafeRefCountedBase {
+public:
+    ThreadSafeRefCountedBase() = default;
+
+    void ref() const
+    {
+        ++m_refCount;
+    }
+
+    bool hasOneRef() const
+    {
+        return refCount() == 1;
+    }
+
+    unsigned refCount() const
+    {
+        return m_refCount;
+    }
+
+protected:
+    bool derefBase() const
+    {
+        if (!--m_refCount) {
+            m_refCount = 1;
+            return true;
+        }
+        return false;
+    }
+
+private:
+    mutable unsigned m_refCount { 1 };
+};
+
+template <class T, DestructionThread destructionThread = DestructionThread::Any>
+class ThreadSafeRefCounted : public ThreadSafeRefCountedBase {
+public:
+    void deref() const
+    {
+        if (!derefBase())
+            return;
+
+        if constexpr (destructionThread == DestructionThread::Any) {
+            delete static_cast<const T*>(this);
+        } else if constexpr (destructionThread == DestructionThread::Main) {
+            ensureOnMainThread([this] {
+                delete static_cast<const T*>(this);
+            });
+        }
+    }
+
+protected:
+    ThreadSafeRefCounted() = default;
+};
+
+class FancyRefCountedClass final : public ThreadSafeRefCounted<FancyRefCountedClass, DestructionThread::Main> {
+public:
+    static Ref<FancyRefCountedClass> create()
+    {
+        return adoptRef(*new FancyRefCountedClass());
+    }
+
+    virtual ~FancyRefCountedClass();
+
+private:
+    FancyRefCountedClass();
+};
+
+template <class T>
+class BadThreadSafeRefCounted : public ThreadSafeRefCountedBase {
+public:
+    void deref() const
+    {
+        if (!derefBase())
+            return;
+
+        [this] {
+            delete static_cast<const T*>(this);
+        };
+    }
+
+protected:
+    BadThreadSafeRefCounted() = default;
+};
+
+class FancyRefCountedClass2 final : public ThreadSafeRefCounted<FancyRefCountedClass2, DestructionThread::MainRunLoop> {
+// expected-warning@-1{{Class 'ThreadSafeRefCounted' is used as a base of class 'FancyRefCountedClass2' but doesn't have virtual destructor}}
+public:
+    static Ref<FancyRefCountedClass2> create()
+    {
+        return adoptRef(*new FancyRefCountedClass2());
+    }
+
+    virtual ~FancyRefCountedClass2();
+
+private:
+    FancyRefCountedClass2();
+};
+
+template <class T>
+class NestedThreadSafeRefCounted : public ThreadSafeRefCountedBase {
+public:
+    void deref() const
+    {
+        if (!derefBase())
+            return;
+        ensureOnMainRunLoop([&] {
+            auto destroyThis = [&] {
+                delete static_cast<const T*>(this);
+            };
+            destroyThis();
+        });
+    }
+
+protected:
+    NestedThreadSafeRefCounted() = default;
+};
+
+class FancyRefCountedClass3 final : public NestedThreadSafeRefCounted<FancyRefCountedClass3> {
+public:
+    static Ref<FancyRefCountedClass3> create()
+    {
+        return adoptRef(*new FancyRefCountedClass3());
+    }
+
+    virtual ~FancyRefCountedClass3();
+
+private:
+    FancyRefCountedClass3();
+};
+
+template <class T>
+class BadNestedThreadSafeRefCounted : public ThreadSafeRefCountedBase {
+public:
+    void deref() const
+    {
+        if (!derefBase())
+            return;
+        ensureOnMainThread([&] {
+            auto destroyThis = [&] {
+                delete static_cast<const T*>(this);
+            };
+        });
+    }
+
+protected:
+    BadNestedThreadSafeRefCounted() = default;
+};
+
+class FancyRefCountedClass4 final : public BadNestedThreadSafeRefCounted<FancyRefCountedClass4> {
+// expected-warning@-1{{Class 'BadNestedThreadSafeRefCounted' is used as a base of class 'FancyRefCountedClass4' but doesn't have virtual destructor}}
+public:
+    static Ref<FancyRefCountedClass4> create()
+    {
+        return adoptRef(*new FancyRefCountedClass4());
+    }
+
+    virtual ~FancyRefCountedClass4();
+
+private:
+    FancyRefCountedClass4();
+};

From 3c9022c965b85951f30af140da591f819acef8a0 Mon Sep 17 00:00:00 2001
From: AdityaK
Date: Tue, 10 Sep 2024 22:39:02 -0700
Subject: [PATCH 058/114] Bail out jump threading on indirect branches (#103688)

The bug was introduced by https://github.com/llvm/llvm-project/pull/68473

Fixes: #102351
---
 llvm/lib/Transforms/Utils/Local.cpp           |  11 +-
 .../switch-branch-fold-indirectbr-102351.ll   | 104 ++++++++++++++++++
 2 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll

diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index d0669e44f821b..c85c819263e2a 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1028,7 +1028,14 @@ CanRedirectPredsOfEmptyBBToSucc(BasicBlock *BB, BasicBlock *Succ, if (!BB->hasNPredecessorsOrMore(2)) return false; - // Get single common predecessors of both BB and Succ + if (any_of(BBPreds, [](const BasicBlock *Pred) { + return isa<IndirectBrInst>(Pred->begin()) && + isa<IndirectBrInst>(Pred->getTerminator()); + })) + return false; + + // Get the single common predecessor of both BB and Succ. Return false + // when there is more than one common predecessor. for (BasicBlock *SuccPred : SuccPreds) { if (BBPreds.count(SuccPred)) { if (CommonPred) @@ -1133,7 +1140,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, bool BBKillable = CanPropagatePredecessorsForPHIs(BB, Succ, BBPreds); - // Even if we can not fold bB into Succ, we may be able to redirect the + // Even if we can not fold BB into Succ, we may be able to redirect the // predecessors of BB to Succ. bool BBPhisMergeable = BBKillable || diff --git a/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll b/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll new file mode 100644 index 0000000000000..03aee68fa4248 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=simplifycfg -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local noundef i32 @main() { +; CHECK-LABEL: define dso_local noundef i32 @main() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x ptr], align 16 +; CHECK-NEXT: store ptr blockaddress(@main, %[[BB4:.*]]), ptr [[ALLOCA]], align 16, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds [2 x ptr], ptr [[ALLOCA]], i64 0, i64 1 +; CHECK-NEXT: store ptr blockaddress(@main, %[[BB10:.*]]), ptr [[GETELEMENTPTR]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[PHI8:%.*]], %[[BB7:.*]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[PHI9:%.*]], %[[BB7]] ] +; CHECK-NEXT: switch i32 [[PHI]], label %[[BB7]] [ +; CHECK-NEXT: i32 0, label %[[BB12:.*]] +; CHECK-NEXT: i32 1, label %[[BB4]] +; CHECK-NEXT: i32 2, label %[[BB6:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB4]]: +; CHECK-NEXT: [[PHI5:%.*]] = phi i32 [ [[PHI13:%.*]], %[[BB12]] ], [ [[PHI2]], %[[BB1]] ] +; CHECK-NEXT: br label %[[BB7]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo(i32 noundef [[PHI2]]) +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[PHI2]], 1 +; CHECK-NEXT: br label %[[BB12]] +; CHECK: [[BB7]]: +; CHECK-NEXT: [[PHI8]] = phi i32 [ [[PHI]], %[[BB1]] ], [ 2, %[[BB4]] ] +; CHECK-NEXT: [[PHI9]] = phi i32 [ [[PHI2]], %[[BB1]] ], [ [[PHI5]], %[[BB4]] ] +; CHECK-NEXT: br label %[[BB1]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[BB10]]: +; CHECK-NEXT: [[CALL11:%.*]] = call i32 @foo(i32 noundef [[PHI13]]) +; CHECK-NEXT: ret i32 0 +; CHECK: [[BB12]]: +; CHECK-NEXT: [[PHI13]] = phi i32 [ [[ADD]], %[[BB6]] ], [ [[PHI2]], %[[BB1]] ] +; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[PHI13]] to i64 +; CHECK-NEXT: [[GETELEMENTPTR14:%.*]] = getelementptr inbounds [2 x 
ptr], ptr [[ALLOCA]], i64 0, i64 [[SEXT]] +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[GETELEMENTPTR14]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: indirectbr ptr [[LOAD]], [label %[[BB4]], label %bb10] +; +bb: + %alloca = alloca [2 x ptr], align 16 + store ptr blockaddress(@main, %bb4), ptr %alloca, align 16, !tbaa !0 + %getelementptr = getelementptr inbounds [2 x ptr], ptr %alloca, i64 0, i64 1 + store ptr blockaddress(@main, %bb10), ptr %getelementptr, align 8, !tbaa !0 + br label %bb1 + +bb1: ; preds = %bb7, %bb + %phi = phi i32 [ 0, %bb ], [ %phi8, %bb7 ] + %phi2 = phi i32 [ 0, %bb ], [ %phi9, %bb7 ] + switch i32 %phi, label %bb7 [ + i32 0, label %bb3 + i32 1, label %bb4 + i32 2, label %bb6 + ] + +bb3: ; preds = %bb1 + br label %bb12 + +bb4: ; preds = %bb12, %bb1 + %phi5 = phi i32 [ %phi13, %bb12 ], [ %phi2, %bb1 ] + br label %bb7 + +bb6: ; preds = %bb1 + %call = call i32 @foo(i32 noundef %phi2) + %add = add nsw i32 %phi2, 1 + br label %bb12 + +bb7: ; preds = %bb4, %bb1 + %phi8 = phi i32 [ %phi, %bb1 ], [ 2, %bb4 ] + %phi9 = phi i32 [ %phi2, %bb1 ], [ %phi5, %bb4 ] + br label %bb1, !llvm.loop !4 + +bb10: ; preds = %bb12 + %call11 = call i32 @foo(i32 noundef %phi13) + ret i32 0 + +bb12: ; preds = %bb6, %bb3 + %phi13 = phi i32 [ %add, %bb6 ], [ %phi2, %bb3 ] + %sext = sext i32 %phi13 to i64 + %getelementptr14 = getelementptr inbounds [2 x ptr], ptr %alloca, i64 0, i64 %sext + %load = load ptr, ptr %getelementptr14, align 8, !tbaa !0 + indirectbr ptr %load, [label %bb4, label %bb10] +} + +declare i32 @foo(i32) + +!0 = !{!1, !1, i64 0} +!1 = !{!"any pointer", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C++ TBAA"} +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !2, i64 0} +;. +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"any pointer", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK: [[LOOP4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"int", [[META2]], i64 0} +;. From bc152fbf43157659f8b6817e8510e1fbe6e175b5 Mon Sep 17 00:00:00 2001 From: Prabhuk Date: Tue, 10 Sep 2024 23:04:05 -0700 Subject: [PATCH 059/114] [llvm-debuginfod-find] Enable multicall driver (#108082) Migrate llvm-debuginfod-find tool to use GenericOptTable. Enable multicall driver. 
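For readers unfamiliar with the multicall mechanism this patch adopts: GENERATE_DRIVER replaces the tool's main() with a uniquely named entry point, which either a generated standalone main() or the combined llvm driver binary invokes, selecting the tool by the name it was invoked under. A minimal sketch of that shape, using a simplified stand-in for llvm::ToolContext (the real forwarding main and dispatch table are generated by the build system):

  struct ToolContext {            // stand-in for llvm::ToolContext (LLVMDriver.h)
    const char *Path = nullptr;   // how the binary was invoked
  };

  // Entry point exported by the tool; the diff below renames main() to this.
  int llvm_debuginfod_find_main(int argc, char **argv, const ToolContext &) {
    return 0; // the real implementation lives in llvm-debuginfod-find.cpp
  }

  // A standalone build simply forwards; a multicall build would instead pick
  // the entry point that matches argv[0].
  int main(int argc, char **argv) {
    return llvm_debuginfod_find_main(argc, argv, ToolContext{argv[0]});
  }
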
--- .../tools/llvm-debuginfod-find/CMakeLists.txt | 12 +- llvm/tools/llvm-debuginfod-find/Opts.td | 17 +++ .../llvm-debuginfod-find.cpp | 109 ++++++++++++++---- 3 files changed, 115 insertions(+), 23 deletions(-) create mode 100644 llvm/tools/llvm-debuginfod-find/Opts.td diff --git a/llvm/tools/llvm-debuginfod-find/CMakeLists.txt b/llvm/tools/llvm-debuginfod-find/CMakeLists.txt index b98c431c1839b..39da11fcd9599 100644 --- a/llvm/tools/llvm-debuginfod-find/CMakeLists.txt +++ b/llvm/tools/llvm-debuginfod-find/CMakeLists.txt @@ -1,11 +1,21 @@ set(LLVM_LINK_COMPONENTS + Option Object Support ) +set(LLVM_TARGET_DEFINITIONS Opts.td) +tablegen(LLVM Opts.inc -gen-opt-parser-defs) +add_public_tablegen_target(DebugInfodFindOptsTableGen) + add_llvm_tool(llvm-debuginfod-find llvm-debuginfod-find.cpp + DEPENDS + DebugInfodFindOptsTableGen + GENERATE_DRIVER ) -target_link_libraries(llvm-debuginfod-find PRIVATE LLVMDebuginfod) +if(NOT LLVM_TOOL_LLVM_DRIVER_BUILD) + target_link_libraries(llvm-debuginfod-find PRIVATE LLVMDebuginfod) +endif() if(LLVM_INSTALL_BINUTILS_SYMLINKS) add_llvm_tool_symlink(debuginfod-find llvm-debuginfod-find) endif() diff --git a/llvm/tools/llvm-debuginfod-find/Opts.td b/llvm/tools/llvm-debuginfod-find/Opts.td new file mode 100644 index 0000000000000..a770f50d241a2 --- /dev/null +++ b/llvm/tools/llvm-debuginfod-find/Opts.td @@ -0,0 +1,17 @@ +include "llvm/Option/OptParser.td" + +class F<string name, string help> : Flag<["-"], name>, HelpText<help>; +class FF<string name, string help>: Flag<["--"], name>, HelpText<help>; +class S<string name, string help, string meta>: Separate<["--"], name>, HelpText<help>, MetaVarName<meta>; + +def help : FF<"help", "Display available options">; +def : F<"h", "Alias for --help">, Alias<help>; + +def fetch_executable : FF<"executable", "If set, fetch a binary file associated with this build id, containing the executable sections.">; +def fetch_debuginfo : FF<"debuginfo", "If set, fetch a binary file associated with this build id, containing the debuginfo sections.">; +def fetch_source : S<"source", "", "Fetch a source file associated with this build id, which is at this relative path relative to the compilation directory.">; +def dump_to_stdout : FF<"dump", "If set, dumps the contents of the fetched artifact " + "to standard output. Otherwise, dumps the absolute " + "path to the cached artifact on disk.">; +def debug_file_directory : S<"debug-file-directory", "", "Path to directory where to look for debug files.">; + diff --git a/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp b/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp index 425ee8d986a82..1f4404aaa391f 100644 --- a/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp +++ b/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp @@ -16,14 +16,89 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Debuginfod/BuildIDFetcher.h" #include "llvm/Debuginfod/Debuginfod.h" #include "llvm/Debuginfod/HTTPClient.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" +#include "llvm/Support/LLVMDriver.h" using namespace llvm; +// Command-line option boilerplate. +namespace { +enum ID { + OPT_INVALID = 0, // This is not an option ID. +#define OPTION(...) 
LLVM_MAKE_OPT_ID(__VA_ARGS__), +#include "Opts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) \ + static constexpr StringLiteral NAME##_init[] = VALUE; \ + static constexpr ArrayRef<StringLiteral> NAME(NAME##_init, \ + std::size(NAME##_init) - 1); +#include "Opts.inc" +#undef PREFIX + +using namespace llvm::opt; +static constexpr opt::OptTable::Info InfoTable[] = { +#define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__), +#include "Opts.inc" +#undef OPTION +}; + +class DebuginfodFindOptTable : public opt::GenericOptTable { +public: + DebuginfodFindOptTable() : GenericOptTable(InfoTable) {} +}; + +} // end anonymous namespace + +static std::string InputBuildID; +static bool FetchExecutable; +static bool FetchDebuginfo; +static std::string FetchSource; +static bool DumpToStdout; +static std::vector<std::string> DebugFileDirectory; + +static void parseArgs(int argc, char **argv) { + DebuginfodFindOptTable Tbl; + llvm::StringRef ToolName = argv[0]; + llvm::BumpPtrAllocator A; + llvm::StringSaver Saver{A}; + opt::InputArgList Args = + Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](StringRef Msg) { + llvm::errs() << Msg << '\n'; + std::exit(1); + }); + + if (Args.hasArg(OPT_help)) { + Tbl.printHelp(llvm::outs(), + "llvm-debuginfod-find [options] <input build ID>", + ToolName.str().c_str()); + std::exit(0); + } + + InputBuildID = Args.getLastArgValue(OPT_INPUT); + + FetchExecutable = Args.hasArg(OPT_fetch_executable); + FetchDebuginfo = Args.hasArg(OPT_fetch_debuginfo); + DumpToStdout = Args.hasArg(OPT_dump_to_stdout); + FetchSource = Args.getLastArgValue(OPT_fetch_source, ""); + DebugFileDirectory = Args.getAllArgValues(OPT_debug_file_directory); +} + +[[noreturn]] static void helpExit() { + errs() << "Must specify exactly one of --executable, " + "--source=/path/to/file, or --debuginfo.\n"; + exit(1); +} + +/* cl::OptionCategory DebuginfodFindCategory("llvm-debuginfod-find Options"); cl::opt<std::string> InputBuildID(cl::Positional, cl::Required, @@ -60,30 +135,17 @@ static cl::list<std::string> DebugFileDirectory( cl::desc("Path to directory where to look for debug files."), cl::cat(DebuginfodFindCategory)); -[[noreturn]] static void helpExit() { - errs() << "Must specify exactly one of --executable, " - "--source=/path/to/file, or --debuginfo."; - exit(1); -} +*/ -ExitOnError ExitOnErr; +ExitOnError ExitOnDebuginfodFindError; static std::string fetchDebugInfo(object::BuildIDRef BuildID); -int main(int argc, char **argv) { - InitLLVM X(argc, argv); +int llvm_debuginfod_find_main(int argc, char **argv, + const llvm::ToolContext &) { + // InitLLVM X(argc, argv); HTTPClient::initialize(); - - cl::HideUnrelatedOptions({&DebuginfodFindCategory}); - cl::ParseCommandLineOptions( - argc, argv, - "llvm-debuginfod-find: Fetch debuginfod artifacts\n\n" - "This program is a frontend to the debuginfod client library. 
The cache " - "directory, request timeout (in seconds), and debuginfod server urls are " - "set by these environment variables:\n" - "DEBUGINFOD_CACHE_PATH (default set by sys::path::cache_directory)\n" - "DEBUGINFOD_TIMEOUT (defaults to 90s)\n" - "DEBUGINFOD_URLS=[comma separated URLs] (defaults to empty)\n"); + parseArgs(argc, argv); if (FetchExecutable + FetchDebuginfo + (FetchSource != "") != 1) helpExit(); @@ -97,9 +159,10 @@ int main(int argc, char **argv) { std::string Path; if (FetchSource != "") - Path = ExitOnErr(getCachedOrDownloadSource(ID, FetchSource)); + Path = + ExitOnDebuginfodFindError(getCachedOrDownloadSource(ID, FetchSource)); else if (FetchExecutable) - Path = ExitOnErr(getCachedOrDownloadExecutable(ID)); + Path = ExitOnDebuginfodFindError(getCachedOrDownloadExecutable(ID)); else if (FetchDebuginfo) Path = fetchDebugInfo(ID); else @@ -110,11 +173,13 @@ int main(int argc, char **argv) { // Print the contents of the artifact. ErrorOr> Buf = MemoryBuffer::getFile( Path, /*IsText=*/false, /*RequiresNullTerminator=*/false); - ExitOnErr(errorCodeToError(Buf.getError())); + ExitOnDebuginfodFindError(errorCodeToError(Buf.getError())); outs() << Buf.get()->getBuffer(); } else // Print the path to the cached artifact file. outs() << Path << "\n"; + + return 0; } // Find a debug file in local build ID directories and via debuginfod. From 6dbdb8430b492959c399a7809247424c6962902f Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Sep 2024 08:47:24 +0200 Subject: [PATCH 060/114] [Clang] Fix crash due to invalid source location in __is_trivially_equality_comparable (#107815) Fixes #107777 --- clang/lib/Sema/SemaExprCXX.cpp | 3 ++- clang/test/SemaCXX/type-traits.cpp | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 14feafd1e6b17..a14a086731c13 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -5171,7 +5171,8 @@ static bool HasNonDeletedDefaultedEqualityComparison(Sema &S, // const ClassT& obj; OpaqueValueExpr Operand( - {}, Decl->getTypeForDecl()->getCanonicalTypeUnqualified().withConst(), + KeyLoc, + Decl->getTypeForDecl()->getCanonicalTypeUnqualified().withConst(), ExprValueKind::VK_LValue); UnresolvedSet<16> Functions; // obj == obj; diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index b8a9db103782c..91ef7786f11bb 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -4147,6 +4147,24 @@ class Template {}; // Make sure we don't crash when instantiating a type static_assert(!__is_trivially_equality_comparable(Template>)); + +struct S operator==(S, S); + +template struct basic_string_view {}; + +struct basic_string { + operator basic_string_view() const; +}; + +template +const bool is_trivially_equality_comparable = __is_trivially_equality_comparable(T); + +template > +void find(); + +void func() { find(); } + + namespace hidden_friend { struct TriviallyEqualityComparable { From cd0e867756dbab6184d2f250f768a60bc60a0849 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 11 Sep 2024 07:50:39 +0100 Subject: [PATCH 061/114] [AArch64] Update and cleanup arm64-vector-imm.ll test. 
NFC --- llvm/test/CodeGen/AArch64/arm64-vector-imm.ll | 147 +++++++++++------- 1 file changed, 89 insertions(+), 58 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll b/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll index 08bceb850df40..a3efa6b961e63 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll @@ -1,134 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @v_orrimm(ptr %A) nounwind { ; CHECK-LABEL: v_orrimm: -; CHECK-NOT: mov -; CHECK-NOT: mvn -; CHECK: orr - %tmp1 = load <8 x i8>, ptr %A - %tmp3 = or <8 x i8> %tmp1, - ret <8 x i8> %tmp3 +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: orr.2s v0, #1, lsl #24 +; CHECK-NEXT: ret + %tmp1 = load <8 x i8>, ptr %A + %tmp3 = or <8 x i8> %tmp1, + ret <8 x i8> %tmp3 } define <16 x i8> @v_orrimmQ(ptr %A) nounwind { -; CHECK: v_orrimmQ -; CHECK-NOT: mov -; CHECK-NOT: mvn -; CHECK: orr - %tmp1 = load <16 x i8>, ptr %A - %tmp3 = or <16 x i8> %tmp1, - ret <16 x i8> %tmp3 +; CHECK-LABEL: v_orrimmQ: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: orr.4s v0, #1, lsl #24 +; CHECK-NEXT: ret + %tmp1 = load <16 x i8>, ptr %A + %tmp3 = or <16 x i8> %tmp1, + ret <16 x i8> %tmp3 } define <8 x i8> @v_bicimm(ptr %A) nounwind { ; CHECK-LABEL: v_bicimm: -; CHECK-NOT: mov -; CHECK-NOT: mvn -; CHECK: bic - %tmp1 = load <8 x i8>, ptr %A - %tmp3 = and <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 > - ret <8 x i8> %tmp3 +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: bic.2s v0, #255, lsl #24 +; CHECK-NEXT: ret + %tmp1 = load <8 x i8>, ptr %A + %tmp3 = and <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 > + ret <8 x i8> %tmp3 } define <16 x i8> @v_bicimmQ(ptr %A) nounwind { ; CHECK-LABEL: v_bicimmQ: -; CHECK-NOT: mov -; CHECK-NOT: mvn -; CHECK: bic - %tmp1 = load <16 x i8>, ptr %A - %tmp3 = and <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 > - ret <16 x i8> %tmp3 +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: bic.4s v0, #255, lsl #24 +; CHECK-NEXT: ret + %tmp1 = load <16 x i8>, ptr %A + %tmp3 = and <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 > + ret <16 x i8> %tmp3 } define <2 x double> @foo(<2 x double> %bar) nounwind { -; CHECK: foo -; CHECK: fmov.2d v1, #1.0000000 +; CHECK-LABEL: foo: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov.2d v1, #1.00000000 +; CHECK-NEXT: fadd.2d v0, v0, v1 +; CHECK-NEXT: ret %add = fadd <2 x double> %bar, ret <2 x double> %add } define <4 x i32> @movi_4s_imm_t1() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t1: -; CHECK: movi.4s v0, #75 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75 +; CHECK-NEXT: ret +entry: ret <4 x i32> } define <4 x i32> @movi_4s_imm_t2() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t2: -; CHECK: movi.4s v0, #75, lsl #8 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75, lsl #8 +; CHECK-NEXT: ret +entry: ret <4 x i32> } define <4 x i32> @movi_4s_imm_t3() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t3: -; CHECK: movi.4s v0, #75, lsl #16 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75, lsl #16 +; CHECK-NEXT: ret +entry: ret <4 x i32> } define <4 x i32> 
@movi_4s_imm_t4() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t4: -; CHECK: movi.4s v0, #75, lsl #24 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75, lsl #24 +; CHECK-NEXT: ret +entry: ret <4 x i32> <i32 1258291200, i32 1258291200, i32 1258291200, i32 1258291200> } define <8 x i16> @movi_8h_imm_t5() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_8h_imm_t5: -; CHECK: movi.8h v0, #75 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.8h v0, #75 +; CHECK-NEXT: ret +entry: ret <8 x i16> <i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75> } ; rdar://11989841 define <8 x i16> @movi_8h_imm_t6() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_8h_imm_t6: -; CHECK: movi.8h v0, #75, lsl #8 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.8h v0, #75, lsl #8 +; CHECK-NEXT: ret +entry: ret <8 x i16> <i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200> } define <4 x i32> @movi_4s_imm_t7() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t7: -; CHECK: movi.4s v0, #75, msl #8 -ret <4 x i32> <i32 19455, i32 19455, i32 19455, i32 19455> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75, msl #8 +; CHECK-NEXT: ret +entry: + ret <4 x i32> <i32 19455, i32 19455, i32 19455, i32 19455> } define <4 x i32> @movi_4s_imm_t8() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t8: -; CHECK: movi.4s v0, #75, msl #16 -ret <4 x i32> <i32 4980735, i32 4980735, i32 4980735, i32 4980735> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75, msl #16 +; CHECK-NEXT: ret +entry: + ret <4 x i32> <i32 4980735, i32 4980735, i32 4980735, i32 4980735> } define <16 x i8> @movi_16b_imm_t9() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_16b_imm_t9: -; CHECK: movi.16b v0, #75 -ret <16 x i8> <i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.16b v0, #75 +; CHECK-NEXT: ret +entry: + ret <16 x i8> <i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75> } define <2 x i64> @movi_2d_imm_t10() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_2d_imm_t10: -; CHECK: movi.2d v0, #0xff00ff00ff00ff -ret <2 x i64> <i64 71777214294589695, i64 71777214294589695> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.2d v0, #0xff00ff00ff00ff +; CHECK-NEXT: ret +entry: + ret <2 x i64> <i64 71777214294589695, i64 71777214294589695> } define <4 x i32> @movi_4s_imm_t11() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t11: -; CHECK: fmov.4s v0, #-0.32812500 -ret <4 x i32> <i32 -1096286208, i32 -1096286208, i32 -1096286208, i32 -1096286208> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov.4s v0, #-0.32812500 +; CHECK-NEXT: ret +entry: + ret <4 x i32> <i32 -1096286208, i32 -1096286208, i32 -1096286208, i32 -1096286208> } define <2 x i64> @movi_2d_imm_t12() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_2d_imm_t12: -; CHECK: fmov.2d v0, #-0.17187500 -ret <2 x i64> <i64 -4628011567076605952, i64 -4628011567076605952> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov.2d v0, #-0.17187500 +; CHECK-NEXT: ret +entry: + ret <2 x i64> <i64 -4628011567076605952, i64 -4628011567076605952> } From 748023dc3210533df2c1c6efc8af1b5954493701 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Sep 2024 08:59:46 +0200 Subject: [PATCH 062/114] [libc++][NFC] Replace _LIBCPP_NORETURN and TEST_NORETURN with [[noreturn]] (#80455) `[[__noreturn__]]` is now always available, so we can simply use the attribute directly instead of through a macro. 
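As background for this mechanical change: [[noreturn]] is a standard C++11 attribute, and [[__noreturn__]] is the same attribute spelled with reserved identifiers, which library headers prefer so that a user-defined macro named noreturn cannot interfere. A small illustrative snippet, not taken from the patch:

  #include <cstdlib>

  // Both declarations carry the same standard attribute; the reserved
  // spelling cannot collide with a user-defined macro named "noreturn".
  [[noreturn]] void die() { std::abort(); }
  [[__noreturn__]] void die_reserved() { std::abort(); }

  int main() { die(); }
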
--- libcxx/.clang-format | 1 - libcxx/include/__chrono/exception.h | 4 ++-- libcxx/include/__config | 2 -- libcxx/include/__exception/exception_ptr.h | 2 +- libcxx/include/__exception/nested_exception.h | 8 ++++---- libcxx/include/__exception/operations.h | 4 ++-- libcxx/include/__exception/terminate.h | 2 +- libcxx/include/__filesystem/filesystem_error.h | 4 ++-- libcxx/include/__format/format_error.h | 2 +- .../include/__format/parser_std_format_spec.h | 4 ++-- libcxx/include/__functional/function.h | 2 +- libcxx/include/__memory/shared_ptr.h | 2 +- libcxx/include/__system_error/system_error.h | 4 ++-- libcxx/include/__utility/unreachable.h | 2 +- libcxx/include/__verbose_abort | 2 +- libcxx/include/any | 2 +- libcxx/include/future | 2 +- libcxx/include/ios | 2 +- libcxx/include/new | 4 ++-- libcxx/include/optional | 2 +- libcxx/include/regex | 2 +- libcxx/include/stdexcept | 18 +++++++++--------- libcxx/include/string | 4 ++-- libcxx/include/typeinfo | 2 +- libcxx/include/variant | 2 +- libcxx/include/vector | 8 ++++---- libcxx/src/stdexcept.cpp | 2 +- libcxx/src/string.cpp | 4 ++-- .../src/support/runtime/exception_fallback.ipp | 4 ++-- libcxx/src/support/runtime/exception_msvc.ipp | 4 ++-- .../runtime/exception_pointer_cxxabi.ipp | 4 ++-- .../runtime/exception_pointer_glibcxx.ipp | 6 +++--- .../support/runtime/exception_pointer_msvc.ipp | 4 ++-- .../exception_pointer_unimplemented.ipp | 4 ++-- libcxx/src/vector.cpp | 4 ++-- libcxx/test/support/assert_macros.h | 2 +- libcxx/test/support/check_assertion.h | 4 ++-- libcxx/test/support/count_new.h | 7 +++---- libcxx/test/support/test_macros.h | 6 ------ 39 files changed, 69 insertions(+), 79 deletions(-) diff --git a/libcxx/.clang-format b/libcxx/.clang-format index b2ca452931fec..c37b234e857de 100644 --- a/libcxx/.clang-format +++ b/libcxx/.clang-format @@ -44,7 +44,6 @@ AttributeMacros: [ '_LIBCPP_NO_UNIQUE_ADDRESS', '_LIBCPP_NOALIAS', '_LIBCPP_NODISCARD', - '_LIBCPP_NORETURN', '_LIBCPP_OVERRIDABLE_FUNC_VIS', '_LIBCPP_STANDALONE_DEBUG', '_LIBCPP_TEMPLATE_DATA_VIS', diff --git a/libcxx/include/__chrono/exception.h b/libcxx/include/__chrono/exception.h index 266f8fac44176..cc408d78a36da 100644 --- a/libcxx/include/__chrono/exception.h +++ b/libcxx/include/__chrono/exception.h @@ -71,7 +71,7 @@ class nonexistent_local_time : public runtime_error { }; template <class _Duration> -_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_nonexistent_local_time( +[[noreturn]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_nonexistent_local_time( [[maybe_unused]] const local_time<_Duration>& __time, [[maybe_unused]] const local_info& __info) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw nonexistent_local_time(__time, __info); @@ -115,7 +115,7 @@ class ambiguous_local_time : public runtime_error { }; template <class _Duration> -_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_ambiguous_local_time( +[[noreturn]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_ambiguous_local_time( [[maybe_unused]] const local_time<_Duration>& __time, [[maybe_unused]] const local_info& __info) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw ambiguous_local_time(__time, __info); diff --git a/libcxx/include/__config b/libcxx/include/__config index bccf90d1dbacd..b0a5dda147a6a 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -312,7 +312,6 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ALIGNOF(_Tp) alignof(_Tp) # define _ALIGNAS_TYPE(x) alignas(x) # define _ALIGNAS(x) alignas(x) -# define _LIBCPP_NORETURN 
[[noreturn]] # define _NOEXCEPT noexcept # define _NOEXCEPT_(...) noexcept(__VA_ARGS__) # define _LIBCPP_CONSTEXPR constexpr @@ -322,7 +321,6 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ALIGNOF(_Tp) _Alignof(_Tp) # define _ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCPP_ALIGNOF(x)))) # define _ALIGNAS(x) __attribute__((__aligned__(x))) -# define _LIBCPP_NORETURN __attribute__((__noreturn__)) # define _LIBCPP_HAS_NO_NOEXCEPT # define nullptr __nullptr # define _NOEXCEPT throw() diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index beadd9212abd1..9e5351f534a1c 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -159,7 +159,7 @@ _LIBCPP_EXPORTED_FROM_ABI void swap(exception_ptr&, exception_ptr&) _NOEXCEPT; _LIBCPP_EXPORTED_FROM_ABI exception_ptr __copy_exception_ptr(void* __except, const void* __ptr); _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); // This is a built-in template function which automagically extracts the required // information. diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h index 4c7970d167ffa..8e817e1c06978 100644 --- a/libcxx/include/__exception/nested_exception.h +++ b/libcxx/include/__exception/nested_exception.h @@ -40,7 +40,7 @@ class _LIBCPP_EXPORTED_FROM_ABI nested_exception { virtual ~nested_exception() _NOEXCEPT; // access functions - _LIBCPP_NORETURN void rethrow_nested() const; + [[__noreturn__]] void rethrow_nested() const; _LIBCPP_HIDE_FROM_ABI exception_ptr nested_ptr() const _NOEXCEPT { return __ptr_; } }; @@ -55,19 +55,19 @@ struct __throw_with_nested; template <class _Tp, class _Up> struct __throw_with_nested<_Tp, _Up, true> { - _LIBCPP_NORETURN static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { + [[__noreturn__]] static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { throw __nested<_Up>(std::forward<_Tp>(__t)); } }; template <class _Tp, class _Up> struct __throw_with_nested<_Tp, _Up, false> { - _LIBCPP_NORETURN static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { throw std::forward<_Tp>(__t); } + [[__noreturn__]] static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { throw std::forward<_Tp>(__t); } }; #endif template <class _Tp> -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void throw_with_nested(_Tp&& __t) { +[[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void throw_with_nested(_Tp&& __t) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS using _Up = __decay_t<_Tp>; static_assert(is_copy_constructible<_Up>::value, "type thrown must be CopyConstructible"); diff --git 
exception_ptr; _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); } // namespace std #endif // _LIBCPP___EXCEPTION_OPERATIONS_H diff --git a/libcxx/include/__exception/terminate.h b/libcxx/include/__exception/terminate.h index e672471dc5263..0bfc3506d3791 100644 --- a/libcxx/include/__exception/terminate.h +++ b/libcxx/include/__exception/terminate.h @@ -16,7 +16,7 @@ #endif namespace std { // purposefully not using versioning namespace -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void terminate() _NOEXCEPT; +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void terminate() _NOEXCEPT; } // namespace std #endif // _LIBCPP___EXCEPTION_TERMINATE_H diff --git a/libcxx/include/__filesystem/filesystem_error.h b/libcxx/include/__filesystem/filesystem_error.h index 80a11e3b1932c..f43568c2004d2 100644 --- a/libcxx/include/__filesystem/filesystem_error.h +++ b/libcxx/include/__filesystem/filesystem_error.h @@ -69,13 +69,13 @@ class _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY _LIBCPP_EXPORTED_FROM_ABI filesyst # ifndef _LIBCPP_HAS_NO_EXCEPTIONS template -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void __throw_filesystem_error(_Args&&... __args) { throw filesystem_error(std::forward<_Args>(__args)...); } # else template -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void __throw_filesystem_error(_Args&&...) { _LIBCPP_VERBOSE_ABORT("filesystem_error was thrown in -fno-exceptions mode"); } diff --git a/libcxx/include/__format/format_error.h b/libcxx/include/__format/format_error.h index 35a39ee82f3da..1df7dbff2b7df 100644 --- a/libcxx/include/__format/format_error.h +++ b/libcxx/include/__format/format_error.h @@ -35,7 +35,7 @@ class _LIBCPP_EXPORTED_FROM_ABI format_error : public runtime_error { }; _LIBCPP_DIAGNOSTIC_POP -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_format_error(const char* __s) { +[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI void __throw_format_error(const char* __s) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw format_error(__s); # else diff --git a/libcxx/include/__format/parser_std_format_spec.h b/libcxx/include/__format/parser_std_format_spec.h index 28891e5d2876c..6bdf8e319ba44 100644 --- a/libcxx/include/__format/parser_std_format_spec.h +++ b/libcxx/include/__format/parser_std_format_spec.h @@ -52,13 +52,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace __format_spec { -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void +[[noreturn]] _LIBCPP_HIDE_FROM_ABI inline void __throw_invalid_option_format_error(const char* __id, const char* __option) { std::__throw_format_error( (string("The format specifier for ") + __id + " does not allow the " + __option + " option").c_str()); } -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void __throw_invalid_type_format_error(const char* __id) { +[[noreturn]] _LIBCPP_HIDE_FROM_ABI inline void __throw_invalid_type_format_error(const char* __id) { std::__throw_format_error( (string("The type option contains an invalid value for ") + __id + " formatting argument").c_str()); } diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index c7b98035e34bf..ff31011caa329 100644 --- 
a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -78,7 +78,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_function_call : public exception { }; _LIBCPP_DIAGNOSTIC_POP -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_function_call() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_function_call() { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_function_call(); # else diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index 4dd8022822d22..5dcd475e2c9f9 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -123,7 +123,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_weak_ptr : public std::exception { const char* what() const _NOEXCEPT override; }; -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_weak_ptr() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_weak_ptr() { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_weak_ptr(); #else diff --git a/libcxx/include/__system_error/system_error.h b/libcxx/include/__system_error/system_error.h index 362e67505658c..3ffa1029ca5c2 100644 --- a/libcxx/include/__system_error/system_error.h +++ b/libcxx/include/__system_error/system_error.h @@ -39,8 +39,8 @@ class _LIBCPP_EXPORTED_FROM_ABI system_error : public runtime_error { _LIBCPP_HIDE_FROM_ABI const error_code& code() const _NOEXCEPT { return __ec_; } }; -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_system_error(int __ev, const char* __what_arg); -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void __throw_system_error(error_code __ec, const char* __what_arg) { +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_system_error(int __ev, const char* __what_arg); +[[__noreturn__]] _LIBCPP_HIDE_FROM_ABI inline void __throw_system_error(error_code __ec, const char* __what_arg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw system_error(__ec, __what_arg); #else diff --git a/libcxx/include/__utility/unreachable.h b/libcxx/include/__utility/unreachable.h index d833f74c2e4f1..5525452aa55ef 100644 --- a/libcxx/include/__utility/unreachable.h +++ b/libcxx/include/__utility/unreachable.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void __libcpp_unreachable() { +[[__noreturn__]] _LIBCPP_HIDE_FROM_ABI inline void __libcpp_unreachable() { _LIBCPP_ASSERT_INTERNAL(false, "std::unreachable() was reached"); __builtin_unreachable(); } diff --git a/libcxx/include/__verbose_abort b/libcxx/include/__verbose_abort index 195ce65b721ff..244278aec652d 100644 --- a/libcxx/include/__verbose_abort +++ b/libcxx/include/__verbose_abort @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // This function should never be called directly from the code -- it should only be called through // the _LIBCPP_VERBOSE_ABORT macro. -_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS +[[__noreturn__]] _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...); // _LIBCPP_VERBOSE_ABORT(format, args...) 
diff --git a/libcxx/include/any b/libcxx/include/any index 7630e8a057d05..6e4ff31ff9b62 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -127,7 +127,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST void __throw_bad_any_cast() { +[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST void __throw_bad_any_cast() { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_any_cast(); # else diff --git a/libcxx/include/future b/libcxx/include/future index 01c0b10172cd3..9a0eb7971a313 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -472,7 +472,7 @@ inline _LIBCPP_HIDE_FROM_ABI error_condition make_error_condition(future_errc __ return error_condition(static_cast(__e), future_category()); } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_future_error(future_errc __ev); +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_future_error(future_errc __ev); class _LIBCPP_EXPORTED_FROM_ABI future_error : public logic_error { error_code __ec_; diff --git a/libcxx/include/ios b/libcxx/include/ios index 426838b91e5dc..61a05fadd29a1 100644 --- a/libcxx/include/ios +++ b/libcxx/include/ios @@ -440,7 +440,7 @@ public: ~failure() _NOEXCEPT override; }; -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_failure(char const* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_failure(char const* __msg) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw ios_base::failure(__msg); # else diff --git a/libcxx/include/new b/libcxx/include/new index 9015c4e712763..207e4b46e0ca6 100644 --- a/libcxx/include/new +++ b/libcxx/include/new @@ -166,9 +166,9 @@ public: }; #endif // defined(_LIBCPP_ABI_VCRUNTIME) && defined(_HAS_EXCEPTIONS) && _HAS_EXCEPTIONS == 0 -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_bad_alloc(); // not in C++ spec +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_bad_alloc(); // not in C++ spec -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_array_new_length() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_array_new_length() { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_array_new_length(); #else diff --git a/libcxx/include/optional b/libcxx/include/optional index 41d7515a2b689..b0933b59b25d2 100644 --- a/libcxx/include/optional +++ b/libcxx/include/optional @@ -255,7 +255,7 @@ public: _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS void +[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS void __throw_bad_optional_access() { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_optional_access(); diff --git a/libcxx/include/regex b/libcxx/include/regex index 08aebc2266f5d..d59abb8daf8ec 100644 --- a/libcxx/include/regex +++ b/libcxx/include/regex @@ -983,7 +983,7 @@ public: }; template <regex_constants::error_type _Ev> -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_regex_error() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_regex_error() { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw regex_error(_Ev); #else diff --git a/libcxx/include/stdexcept b/libcxx/include/stdexcept index 853c185187c77..bdfc27aeac374 100644 --- a/libcxx/include/stdexcept +++ b/libcxx/include/stdexcept @@ -209,9 +209,9 @@ public: _LIBCPP_BEGIN_NAMESPACE_STD // in the dylib -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void 
__throw_runtime_error(const char*); -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_logic_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_logic_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw logic_error(__msg); #else @@ -219,7 +219,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_logic_error(const cha #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_domain_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_domain_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw domain_error(__msg); #else @@ -227,7 +227,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_domain_error(const ch #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_invalid_argument(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_invalid_argument(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw invalid_argument(__msg); #else @@ -235,7 +235,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_invalid_argument(cons #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_length_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_length_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw length_error(__msg); #else @@ -243,7 +243,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_length_error(const ch #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw out_of_range(__msg); #else @@ -251,7 +251,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range(const ch #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_range_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_range_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw range_error(__msg); #else @@ -259,7 +259,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_range_error(const cha #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_overflow_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_overflow_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw overflow_error(__msg); #else @@ -267,7 +267,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_overflow_error(const #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_underflow_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_underflow_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw underflow_error(__msg); #else diff --git a/libcxx/include/string b/libcxx/include/string index 3480b57375c11..46c5a5ac6de60 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -2230,11 +2230,11 @@ private: return std::__is_pointer_in_range(data(), data() + size() + 1, std::addressof(__v)); } - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("basic_string"); } - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("basic_string"); } diff --git 
a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 54e0b4cf5d634..a44fa4d73ee58 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -373,7 +373,7 @@ private: #endif // defined(_LIBCPP_ABI_VCRUNTIME) && _HAS_EXCEPTIONS == 0 _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_cast() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_cast() { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_cast(); #else diff --git a/libcxx/include/variant b/libcxx/include/variant index 1367cd66f3701..1cac603c27c24 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -298,7 +298,7 @@ struct __farray { _LIBCPP_HIDE_FROM_ABI constexpr const _Tp& operator[](size_t __n) const noexcept { return __buf_[__n]; } }; -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS void +[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS void __throw_bad_variant_access() { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_variant_access(); diff --git a/libcxx/include/vector b/libcxx/include/vector index 2442852c764a6..fc0a48669fe53 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -995,9 +995,9 @@ private: __move_assign_alloc(__c, integral_constant<bool, __alloc_traits::propagate_on_container_move_assignment::value>()); } - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); } + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); } - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); } + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector& __c, true_type) { if (__alloc() != __c.__alloc()) { @@ -2163,9 +2163,9 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __invariants() const; private: - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); } + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); } - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); } + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); } template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void diff --git a/libcxx/src/stdexcept.cpp b/libcxx/src/stdexcept.cpp index bc25c0f9e6ef6..134d28efb750f 100644 --- a/libcxx/src/stdexcept.cpp +++ b/libcxx/src/stdexcept.cpp @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NORETURN void __throw_runtime_error(const char* msg) { +void __throw_runtime_error(const char* msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw runtime_error(msg); #else diff --git a/libcxx/src/string.cpp b/libcxx/src/string.cpp index cf07b3ef1ef27..12db5381a7b1b 100644 --- a/libcxx/src/string.cpp +++ b/libcxx/src/string.cpp @@ -28,8 +28,8 @@ struct __basic_string_common; // The struct isn't declared anymore in the headers. It's only here for ABI compatibility. 
template <> struct __basic_string_common<true> { - _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const; - _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const; + [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const; + [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const; }; void __basic_string_common<true>::__throw_length_error() const { std::__throw_length_error("basic_string"); } diff --git a/libcxx/src/support/runtime/exception_fallback.ipp b/libcxx/src/support/runtime/exception_fallback.ipp index 18ff4b83d8765..ca542c9497214 100644 --- a/libcxx/src/support/runtime/exception_fallback.ipp +++ b/libcxx/src/support/runtime/exception_fallback.ipp @@ -21,7 +21,7 @@ unexpected_handler set_unexpected(unexpected_handler func) noexcept { unexpected_handler get_unexpected() noexcept { return __libcpp_atomic_load(&__unexpected_handler); } -_LIBCPP_NORETURN void unexpected() { +[[noreturn]] void unexpected() { (*get_unexpected())(); // unexpected handler should not return terminate(); @@ -33,7 +33,7 @@ terminate_handler set_terminate(terminate_handler func) noexcept { terminate_handler get_terminate() noexcept { return __libcpp_atomic_load(&__terminate_handler); } -_LIBCPP_NORETURN void terminate() noexcept { +[[noreturn]] void terminate() noexcept { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { #endif // _LIBCPP_HAS_NO_EXCEPTIONS diff --git a/libcxx/src/support/runtime/exception_msvc.ipp b/libcxx/src/support/runtime/exception_msvc.ipp index 323cd9d180057..163aec057d9b5 100644 --- a/libcxx/src/support/runtime/exception_msvc.ipp +++ b/libcxx/src/support/runtime/exception_msvc.ipp @@ -32,7 +32,7 @@ unexpected_handler set_unexpected(unexpected_handler func) noexcept { return ::s unexpected_handler get_unexpected() noexcept { return ::_get_unexpected(); } -_LIBCPP_NORETURN void unexpected() { +[[noreturn]] void unexpected() { (*get_unexpected())(); // unexpected handler should not return terminate(); @@ -42,7 +42,7 @@ terminate_handler set_terminate(terminate_handler func) noexcept { return ::set_ terminate_handler get_terminate() noexcept { return ::_get_terminate(); } -_LIBCPP_NORETURN void terminate() noexcept { +[[noreturn]] void terminate() noexcept { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { #endif // _LIBCPP_HAS_NO_EXCEPTIONS diff --git a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp index bdb17b9996b7e..8f5c2060bb06c 100644 --- a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp +++ b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp @@ -40,7 +40,7 @@ nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {} nested_exception::~nested_exception() noexcept {} -_LIBCPP_NORETURN void nested_exception::rethrow_nested() const { +void nested_exception::rethrow_nested() const { if (__ptr_ == nullptr) terminate(); rethrow_exception(__ptr_); @@ -55,7 +55,7 @@ exception_ptr current_exception() noexcept { return ptr; } -_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) { +void rethrow_exception(exception_ptr p) { __cxa_rethrow_primary_exception(p.__ptr_); // if p.__ptr_ is NULL, above returns so we terminate terminate(); diff --git a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp index 6dad248f9e1fd..174b44ce0e6f7 100644 --- a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp +++ b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp @@ -31,7 +31,7 @@ 
struct exception_ptr { } // namespace __exception_ptr -_LIBCPP_NORETURN void rethrow_exception(__exception_ptr::exception_ptr); +[[noreturn]] void rethrow_exception(__exception_ptr::exception_ptr); exception_ptr::~exception_ptr() noexcept { reinterpret_cast<__exception_ptr::exception_ptr*>(this)->~exception_ptr(); } @@ -55,13 +55,13 @@ exception_ptr exception_ptr::__from_native_exception_pointer(void* __e) noexcept nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {} -_LIBCPP_NORETURN void nested_exception::rethrow_nested() const { +[[noreturn]] void nested_exception::rethrow_nested() const { if (__ptr_ == nullptr) terminate(); rethrow_exception(__ptr_); } -_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) { +[[noreturn]] void rethrow_exception(exception_ptr p) { rethrow_exception(reinterpret_cast<__exception_ptr::exception_ptr&>(p)); } diff --git a/libcxx/src/support/runtime/exception_pointer_msvc.ipp b/libcxx/src/support/runtime/exception_pointer_msvc.ipp index b87742b32ded6..2be5136176e32 100644 --- a/libcxx/src/support/runtime/exception_pointer_msvc.ipp +++ b/libcxx/src/support/runtime/exception_pointer_msvc.ipp @@ -61,13 +61,13 @@ exception_ptr current_exception() noexcept { return __ret; } -_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) { __ExceptionPtrRethrow(&p); } +[[noreturn]] void rethrow_exception(exception_ptr p) { __ExceptionPtrRethrow(&p); } nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {} nested_exception::~nested_exception() noexcept {} -_LIBCPP_NORETURN void nested_exception::rethrow_nested() const { +[[noreturn]] void nested_exception::rethrow_nested() const { if (__ptr_ == nullptr) terminate(); rethrow_exception(__ptr_); diff --git a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp index e12b0caf419d2..1fe3127f18b0b 100644 --- a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp +++ b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp @@ -44,7 +44,7 @@ nested_exception::~nested_exception() noexcept {} #endif -_LIBCPP_NORETURN void nested_exception::rethrow_nested() const { +[[noreturn]] void nested_exception::rethrow_nested() const { #warning exception_ptr not yet implemented fprintf(stderr, "exception_ptr not yet implemented\n"); ::abort(); @@ -61,7 +61,7 @@ exception_ptr current_exception() noexcept { ::abort(); } -_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) { +[[noreturn]] void rethrow_exception(exception_ptr p) { #warning exception_ptr not yet implemented fprintf(stderr, "exception_ptr not yet implemented\n"); ::abort(); diff --git a/libcxx/src/vector.cpp b/libcxx/src/vector.cpp index b6153b0e9bf99..3f3a906d6421f 100644 --- a/libcxx/src/vector.cpp +++ b/libcxx/src/vector.cpp @@ -17,8 +17,8 @@ struct __vector_base_common; template <> struct __vector_base_common<true> { - _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const; - _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const; + [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const; + [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const; }; void __vector_base_common<true>::__throw_length_error() const { std::__throw_length_error("vector"); } diff --git a/libcxx/test/support/assert_macros.h b/libcxx/test/support/assert_macros.h index 1059823dcb246..b7011794025bf 100644 --- a/libcxx/test/support/assert_macros.h +++ b/libcxx/test/support/assert_macros.h @@ 
-50,7 +50,7 @@ void test_log(const char* condition, const char* file, int line, const F& functo } template <class Arg> -TEST_NORETURN void test_fail(const char* file, int line, const Arg& arg) { +[[noreturn]] void test_fail(const char* file, int line, const Arg& arg) { test_log("", file, line, arg); std::abort(); } diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h index 329ce819a6c8d..47ebfeeeefc0f 100644 --- a/libcxx/test/support/check_assertion.h +++ b/libcxx/test/support/check_assertion.h @@ -142,7 +142,7 @@ std::string ToString(std::array<DeathCause, N> const& causes) { return ss.str(); } -TEST_NORETURN void StopChildProcess(DeathCause cause) { std::exit(static_cast<int>(cause)); } +[[noreturn]] void StopChildProcess(DeathCause cause) { std::exit(static_cast<int>(cause)); } DeathCause ConvertToDeathCause(int val) { if (val < static_cast<int>(DeathCause::VerboseAbort) || val > static_cast<int>(DeathCause::Unknown)) { @@ -260,7 +260,7 @@ class DeathTest { } template <class Func> - TEST_NORETURN void RunForChild(Func&& f) { + [[noreturn]] void RunForChild(Func&& f) { close(GetStdOutReadFD()); // don't need to read from the pipe in the child. close(GetStdErrReadFD()); auto DupFD = [](int DestFD, int TargetFD) { diff --git a/libcxx/test/support/count_new.h b/libcxx/test/support/count_new.h index 61c8ca16ab0d0..c8169d3acceab 100644 --- a/libcxx/test/support/count_new.h +++ b/libcxx/test/support/count_new.h @@ -24,14 +24,13 @@ namespace detail { - TEST_NORETURN - inline void throw_bad_alloc_helper() { +[[noreturn]] inline void throw_bad_alloc_helper() { #ifndef TEST_HAS_NO_EXCEPTIONS - throw std::bad_alloc(); + throw std::bad_alloc(); #else std::abort(); #endif - } +} } class MemCounter diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 5d4c1a65cfafb..3aa818af1d269 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -214,12 +214,6 @@ #define TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT #endif -#if defined(_LIBCPP_NORETURN) -#define TEST_NORETURN _LIBCPP_NORETURN -#else -#define TEST_NORETURN [[noreturn]] -#endif - #if defined(_LIBCPP_HAS_NO_ALIGNED_ALLOCATION) || \ (!(TEST_STD_VER > 14 || \ (defined(__cpp_aligned_new) && __cpp_aligned_new >= 201606L))) From 1e3a24d2e4eb63c17b962161ae6588d1b2c178f8 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 11 Sep 2024 09:36:20 +0200 Subject: [PATCH 063/114] [InitUndef] Don't use largest super class (#107885) The InitUndef pass currently uses the getLargestSuperClass() hook (which is only used by that pass) to choose the register to initialize. This was done to reduce the number of undef init pseudos needed, e.g. so that the vrnov0 regclass would use the same pseudo as v0. After #106744 we use a single generic pseudo, so this is no longer necessary. --- .../include/llvm/CodeGen/TargetRegisterInfo.h | 9 --- llvm/lib/CodeGen/InitUndef.cpp | 10 ++-- llvm/lib/Target/ARM/ARMBaseRegisterInfo.h | 13 ----- llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 13 ----- .../rvv/subregister-undef-early-clobber.mir | 58 +++++++++---------- 5 files changed, 33 insertions(+), 70 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 197f66e8659d5..ebf06bc57948f 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -1204,15 +1204,6 @@ class TargetRegisterInfo : public MCRegisterInfo { return false; } - /// Returns the Largest Super Class that is being initialized. 
There - /// should be a Pseudo Instruction implemented for the super class - /// that is being returned to ensure that Init Undef can apply the - /// initialization correctly. - virtual const TargetRegisterClass * - getLargestSuperClass(const TargetRegisterClass *RC) const { - llvm_unreachable("Unexpected target register class."); - } - /// Returns if the architecture being targeted has the required Pseudo /// Instructions for initializing the register. By default this returns false, /// but where it is overriden for an architecture, the behaviour will be diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp index 8d20f2668de6b..1613e413712d2 100644 --- a/llvm/lib/CodeGen/InitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -152,8 +152,7 @@ bool InitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, if (Info.UsedLanes == Info.DefinedLanes) continue; - const TargetRegisterClass *TargetRegClass = - TRI->getLargestSuperClass(MRI->getRegClass(Reg)); + const TargetRegisterClass *TargetRegClass = MRI->getRegClass(Reg); LaneBitmask NeedDef = Info.UsedLanes & ~Info.DefinedLanes; @@ -172,8 +171,8 @@ bool InitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, Register LatestReg = Reg; for (auto ind : SubRegIndexNeedInsert) { Changed = true; - const TargetRegisterClass *SubRegClass = TRI->getLargestSuperClass( - TRI->getSubRegisterClass(TargetRegClass, ind)); + const TargetRegisterClass *SubRegClass = + TRI->getSubRegisterClass(TargetRegClass, ind); Register TmpInitSubReg = MRI->createVirtualRegister(SubRegClass); LLVM_DEBUG(dbgs() << "Register Class ID" << SubRegClass->getID() << "\n"); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), @@ -199,8 +198,7 @@ bool InitUndef::fixupIllOperand(MachineInstr *MI, MachineOperand &MO) { dbgs() << "Emitting PseudoInitUndef Instruction for implicit register " << printReg(MO.getReg()) << '\n'); - const TargetRegisterClass *TargetRegClass = - TRI->getLargestSuperClass(MRI->getRegClass(MO.getReg())); + const TargetRegisterClass *TargetRegClass = MRI->getRegClass(MO.getReg()); LLVM_DEBUG(dbgs() << "Register Class ID" << TargetRegClass->getID() << "\n"); Register NewReg = MRI->createVirtualRegister(TargetRegClass); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 53803cff8b90a..58b5e98fd30b1 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -241,19 +241,6 @@ class ARMBaseRegisterInfo : public ARMGenRegisterInfo { int getSEHRegNum(unsigned i) const { return getEncodingValue(i); } - const TargetRegisterClass * - getLargestSuperClass(const TargetRegisterClass *RC) const override { - if (ARM::MQPRRegClass.hasSubClassEq(RC)) - return &ARM::MQPRRegClass; - if (ARM::SPRRegClass.hasSubClassEq(RC)) - return &ARM::SPRRegClass; - if (ARM::DPR_VFP2RegClass.hasSubClassEq(RC)) - return &ARM::DPR_VFP2RegClass; - if (ARM::GPRRegClass.hasSubClassEq(RC)) - return &ARM::GPRRegClass; - return RC; - } - bool doesRegClassHavePseudoInitUndef( const TargetRegisterClass *RC) const override { (void)RC; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 7e04e9154b524..98a712af08539 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -130,19 +130,6 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const 
override; - const TargetRegisterClass * - getLargestSuperClass(const TargetRegisterClass *RC) const override { - if (RISCV::VRM8RegClass.hasSubClassEq(RC)) - return &RISCV::VRM8RegClass; - if (RISCV::VRM4RegClass.hasSubClassEq(RC)) - return &RISCV::VRM4RegClass; - if (RISCV::VRM2RegClass.hasSubClassEq(RC)) - return &RISCV::VRM2RegClass; - if (RISCV::VRRegClass.hasSubClassEq(RC)) - return &RISCV::VRRegClass; - return RC; - } - bool doesRegClassHavePseudoInitUndef( const TargetRegisterClass *RC) const override { return isVRRegClass(RC); diff --git a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir index be6ed4d2a6aa1..ed274cf49fa9b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir +++ b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir @@ -14,9 +14,9 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_1 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -52,7 +52,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_0 @@ -92,7 +92,7 @@ body: | ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_3 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -130,7 +130,7 @@ body: | ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrnov0 = INIT_UNDEF ; 
CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_2 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -166,7 +166,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -239,11 +239,11 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -279,9 +279,9 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_0 @@ -319,11 +319,11 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_2 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: 
[[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_3 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -359,11 +359,11 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_3 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_2 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -401,9 +401,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_5 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -441,9 +441,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_4 ; CHECK-NEXT: 
early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -481,9 +481,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_7 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -521,9 +521,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_6 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -559,9 +559,9 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -597,7 +597,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: 
[[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 @@ -637,7 +637,7 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -675,7 +675,7 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -711,7 +711,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M4_]], %subreg.sub_vrm4_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 From 19f604edfc015b35999b3b95e94f18389e4b392d Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 11 Sep 2024 08:54:45 +0100 Subject: [PATCH 064/114] [lldb][test] Add test for printing std::string through expression evaluator This would've caught the failures in https://github.com/llvm/llvm-project/pull/105865 in the libc++ data-formatter CI. --- .../libcxx/string/TestDataFormatterLibcxxString.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py index 98438742a11ca..6b5bcf8a7df2f 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py @@ -100,6 +100,16 @@ def cleanup(): "s", result_type=ns + "::wstring", result_summary='L"hello world! 
מזל טוב!"' ) + self.expect_expr( + "q", result_type=ns + "::string", result_summary='"hello world"' + ) + + self.expect_expr( + "Q", + result_type=ns + "::string", + result_summary='"quite a long std::strin with lots of info inside it"', + ) + self.expect( "frame variable", substrs=[ From 2afe678f0a246387977a8ca694d4489e2c868991 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 11 Sep 2024 10:04:37 +0200 Subject: [PATCH 065/114] [MemCpyOpt] Allow memcpy elision for non-noalias arguments (#107860) We currently elide memcpys for readonly nocapture noalias arguments. noalias is checked to make sure that there are no other ways to write the memory, e.g. through a different argument or an escaped pointer. In addition to the current noalias check, also query alias analysis, in case it can prove that modification is not possible through other means. This fixes the problem reported in https://discourse.llvm.org/t/problem-about-memcpy-elimination/81121. --- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 19 ++++++++++++++----- llvm/test/Transforms/MemCpyOpt/memcpy.ll | 7 ++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 3f15fa2163d27..d81665622809c 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1950,7 +1950,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { /// during call. Try to use memcpy source directly if all of the following /// conditions are satisfied. /// 1. The memcpy dst is neither modified during the call nor captured by the -/// call. (if readonly, noalias, nocapture attributes on call-site.) +/// call. /// 2. The memcpy dst is an alloca with known alignment & size. /// 2-1. The memcpy length == the alloca size which ensures that the new /// pointer is dereferenceable for the required range @@ -1961,12 +1961,22 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { /// 4. The memcpy src is not modified during the call. (ModRef check shows no /// Mod.) bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) { + BatchAAResults BAA(*AA); + Value *ImmutArg = CB.getArgOperand(ArgNo); + // 1. Ensure passed argument is immutable during call. - if (!(CB.paramHasAttr(ArgNo, Attribute::NoAlias) && - CB.paramHasAttr(ArgNo, Attribute::NoCapture))) + if (!CB.paramHasAttr(ArgNo, Attribute::NoCapture)) + return false; + + // We know that the argument is readonly at this point, but the function + // might still modify the same memory through a different pointer. Exclude + // this either via noalias, or alias analysis. + if (!CB.paramHasAttr(ArgNo, Attribute::NoAlias) && + isModSet( + BAA.getModRefInfo(&CB, MemoryLocation::getBeforeOrAfter(ImmutArg)))) return false; + const DataLayout &DL = CB.getDataLayout(); - Value *ImmutArg = CB.getArgOperand(ArgNo); // 2. 
Check that arg is alloca // TODO: Even if the arg gets back to branches, we can remove memcpy if all @@ -1986,7 +1996,6 @@ bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) { return false; MemCpyInst *MDep = nullptr; - BatchAAResults BAA(*AA); MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( CallAccess->getDefiningAccess(), Loc, BAA); if (auto *MD = dyn_cast(Clobber)) diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll index a28b0542a7c59..ba260752ce4b5 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -472,9 +472,7 @@ define void @immut_param_mayalias(ptr align 4 noalias %val) { ; argument doesn't matter. define void @immut_param_unescaped_alloca(ptr align 4 noalias %val) { ; CHECK-LABEL: @immut_param_unescaped_alloca( -; CHECK-NEXT: [[VAL1:%.*]] = alloca i8, align 4 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VAL1]], ptr align 4 [[VAL:%.*]], i64 1, i1 false) -; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL1]]) +; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL:%.*]]) ; CHECK-NEXT: ret void ; %val1 = alloca i8, align 4 @@ -489,8 +487,7 @@ define void @immut_param_memory_argmem_read(ptr align 4 noalias %val) { ; CHECK-LABEL: @immut_param_memory_argmem_read( ; CHECK-NEXT: [[VAL1:%.*]] = alloca i8, align 4 ; CHECK-NEXT: call void @f(ptr [[VAL1]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VAL1]], ptr align 4 [[VAL:%.*]], i64 1, i1 false) -; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL1]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL:%.*]]) #[[ATTR6:[0-9]+]] ; CHECK-NEXT: ret void ; %val1 = alloca i8, align 4 From 34cab2ed82a63ecf3d0ebf790def6d21bd4b87af Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Wed, 11 Sep 2024 16:10:38 +0800 Subject: [PATCH 066/114] [Driver][test] Remove useless LoongArch test checks in mcmodel.c --- clang/test/Driver/mcmodel.c | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/test/Driver/mcmodel.c b/clang/test/Driver/mcmodel.c index 9681c32579d71..c6c8b5433d23b 100644 --- a/clang/test/Driver/mcmodel.c +++ b/clang/test/Driver/mcmodel.c @@ -43,5 +43,4 @@ // AARCH64-PIC-LARGE: error: invalid argument '-mcmodel=large' only allowed with '-fno-pic' // ERR-AARCH64_32: error: unsupported argument 'small' to option '-mcmodel=' for target 'aarch64_32-unknown-linux' -// ERR-LOONGARCH64-PLT-LARGE: error: invalid argument '-mcmodel=large' not allowed with '-fplt' // ERR-LOONGARCH64-PLT-EXTREME: error: invalid argument '-mcmodel=extreme' not allowed with '-fplt' From c9aa55da62b2a9e482c1877897152fb3c47719d2 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Wed, 11 Sep 2024 09:30:05 +0100 Subject: [PATCH 067/114] [mlir][Linalg] Add speculation for LinalgStructuredOps (#108032) This patch adds speculation behavior for linalg structured ops, allowing them to be hoisted out of loops using LICM. 
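
For illustration, a minimal sketch of the effect (the op, shapes, and SSA
names below are hypothetical, not taken from this patch's tests): after this
change, -loop-invariant-code-motion can hoist a loop-invariant structured op
whose operands all have tensor (value) semantics.

  // %a, %b, %c, %init and the loop bounds are assumed to be defined above
  // the loop, so the matmul is loop-invariant.
  %r = scf.for %i = %lb to %ub step %c1 iter_args(%acc = %init) -> (tensor<8x8xf32>) {
    // Speculatable after this patch (pure tensor semantics), so LICM can
    // move the matmul above the scf.for.
    %mm = linalg.matmul ins(%a, %b : tensor<8x8xf32>, tensor<8x8xf32>)
                        outs(%c : tensor<8x8xf32>) -> tensor<8x8xf32>
    %next = linalg.add ins(%acc, %mm : tensor<8x8xf32>, tensor<8x8xf32>)
                       outs(%acc : tensor<8x8xf32>) -> tensor<8x8xf32>
    scf.yield %next : tensor<8x8xf32>
  }

The same op with memref operands stays inside the loop: speculating an op
that writes to memory past the loop guard could introduce a write that never
happened in the original program.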
---
 .../Dialect/Linalg/IR/LinalgStructuredOps.td  |  1 +
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp      | 31 +++++++
 .../loop-invariant-code-motion.mlir           | 91 +++++++++++++++++++
 .../mlir-linalg-ods-yaml-gen.cpp              |  5 +-
 4 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index ac61117c3d6e3..31f2913924726 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -29,6 +29,7 @@ class LinalgStructuredBase_Op<string mnemonic, list<Trait> props>
   : Op<Linalg_Dialect, mnemonic, !listconcat([
        SingleBlockImplicitTerminator<"YieldOp">,
        DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+       DeclareOpInterfaceMethods<ConditionallySpeculatable>,
        DestinationStyleOpInterface,
        LinalgStructuredInterface,
        ReifyRankedShapedTypeOpInterface], props)> {
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 76df3ecf2d2bd..630985d76a0eb 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -34,6 +34,7 @@
 #include "mlir/IR/OperationSupport.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
@@ -1202,6 +1203,20 @@ void GenericOp::getEffects(
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
 
+static Speculation::Speculatability
+getGenericSpeculatabilityImpl(LinalgOp linalgOp) {
+  // Operands with value semantics are speculatable, while operands with memory
+  // semantics are not.
+  if (!linalgOp.hasPureTensorSemantics())
+    return Speculation::NotSpeculatable;
+  // The body of the op can still have speculation in its region.
+  return Speculation::RecursivelySpeculatable;
+}
+
+Speculation::Speculatability GenericOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
 LogicalResult GenericOp::verify() { return success(); }
 
 namespace {
@@ -1553,6 +1568,10 @@ void MapOp::getEffects(
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
 
+Speculation::Speculatability MapOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
 //===----------------------------------------------------------------------===//
 // ReduceOp
 //===----------------------------------------------------------------------===//
@@ -1621,6 +1640,10 @@ void ReduceOp::getEffects(
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
 
+Speculation::Speculatability ReduceOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
 static ParseResult parseDenseI64ArrayAttr(OpAsmParser &parser,
                                           NamedAttrList &attributes,
                                           StringRef attributeName) {
@@ -1906,6 +1929,10 @@ void TransposeOp::getEffects(
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
 
+Speculation::Speculatability TransposeOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
 LogicalResult TransposeOp::fold(FoldAdaptor adaptor,
                                 SmallVectorImpl<OpFoldResult> &result) {
   // Only the tensor type is supported.
@@ -2134,6 +2161,10 @@ void BroadcastOp::getEffects(
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
 
+Speculation::Speculatability BroadcastOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
 void BroadcastOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                               MLIRContext *context) {
   results.add<EraseIdentityLinalgOp<BroadcastOp>>(context);
diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir
index 47a49465e8a7c..57f4ece9c9f2a 100644
--- a/mlir/test/Transforms/loop-invariant-code-motion.mlir
+++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir
@@ -1118,3 +1118,94 @@ func.func @hoist_from_scf_while(%arg0: i32, %arg1: i32) -> i32 {
   }
   return %0 : i32
 }
+
+// -----
+
+#trait = {
+  indexing_maps = [
+    affine_map<(m, n, k) -> (m, k)>,
+    affine_map<(m, n, k) -> (k, n)>,
+    affine_map<(m, n, k) -> (m, n)>
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// CHECK-LABEL: func @hoist_linalg_ops
+// CHECK: linalg.generic
+// CHECK: scf.for
+// CHECK-NOT: linalg.generic
+// CHECK: tensor.insert_slice
+// CHECK: scf.yield
+func.func @hoist_linalg_ops(%a : tensor<128x128xf32>,
+                            %b : tensor<128x128xf32>,
+                            %c: tensor<128x128xf32>,
+                            %lb : index,
+                            %ub : index,
+                            %step : index,
+                            %output : tensor<?x128xf32>) -> tensor<?x128xf32> {
+  %final =
+  scf.for %i = %lb to %ub step %step iter_args(%acc = %output)
+      -> tensor<?x128xf32> {
+    %compute = linalg.generic #trait
+               ins(%a, %b : tensor<128x128xf32>, tensor<128x128xf32>)
+               outs(%c : tensor<128x128xf32>) {
+    ^bb0(%in : f32, %in2 : f32, %in3 : f32):
+      %mul = arith.mulf %in, %in2 : f32
+      %add = arith.addf %mul, %in3 : f32
+      linalg.yield %in3 : f32
+    } -> tensor<128x128xf32>
+
+    %newacc = tensor.insert_slice %compute into
+              %output[%i, 0][128, 128][1, 1]
+              : tensor<128x128xf32> into tensor<?x128xf32>
+    scf.yield %newacc : tensor<?x128xf32>
+  }
+
+  func.return %final : tensor<?x128xf32>
+}
+
+// -----
+
+#trait = {
+  indexing_maps = [
+    affine_map<(m, n, k) -> (m, k)>,
+    affine_map<(m, n, k) -> (k, n)>,
+    affine_map<(m, n, k) -> (m, n)>
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// CHECK-LABEL: func @hoist_linalg_ops_div_by_zero
+// CHECK-NOT: linalg.generic
+// CHECK: scf.for
+// CHECK: linalg.generic
+// CHECK: tensor.insert_slice
+// CHECK: scf.yield
+func.func @hoist_linalg_ops_div_by_zero(%a : tensor<128x128xi32>,
+                                        %b : tensor<128x128xi32>,
+                                        %c: tensor<128x128xi32>,
+                                        %lb : index,
+                                        %ub : index,
+                                        %step : index,
+                                        %output : tensor<?x128xi32>) -> tensor<?x128xi32> {
+  %cst0 = arith.constant 0 : i32
+  %final =
+  scf.for %i = %lb to %ub step %step iter_args(%acc = %output)
+      -> tensor<?x128xi32> {
+    %compute = linalg.generic #trait
+               ins(%a, %b : tensor<128x128xi32>, tensor<128x128xi32>)
+               outs(%c : tensor<128x128xi32>) {
+    ^bb0(%in : i32, %in2 : i32, %in3 : i32):
+      %div = arith.divui %in, %in2 : i32
+      %add = arith.addi %div, %in3 : i32
+      linalg.yield %in3 : i32
+    } -> tensor<128x128xi32>
+
+    %newacc = tensor.insert_slice %compute into
+              %output[%i, 0][128, 128][1, 1]
+              : tensor<128x128xi32> into tensor<?x128xi32>
+    scf.yield %newacc : tensor<?x128xi32>
+  }
+
+  func.return %final : tensor<?x128xi32>
+}
diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
index a00f12661f712..7d42c03469dc9 100644
--- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
+++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
@@ -656,7 +656,7 @@ ArrayAttr {0}::getIndexingMaps() {{
 }
 )FMT";
 
-// Implementations of fold and getEffects.
+// Implementations of fold, getEffects and getSpeculatability.
 // Parameters:
 // {0}: Class name
 const char structuredOpFoldersFormat[] = R"FMT(
@@ -669,6 +669,9 @@ void {0}::getEffects(SmallVectorImpl<
   if (hasPureTensorSemantics()) return;
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
+Speculation::Speculatability {0}::getSpeculatability() {{
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
 )FMT";
 
 // Implementation of parse/print.

From db64e69fa250ea3a8d7a761220a7922fbdad0f2c Mon Sep 17 00:00:00 2001
From: Abid Qadeer
Date: Wed, 11 Sep 2024 09:31:53 +0100
Subject: [PATCH 068/114] [flang][debug] Handle 'used' module. (#107626)

As described in #98883, we have to qualify a module variable name in the
debugger to get its value. This PR removes that limitation.

LLVM provides `DIImportedEntity` to handle such cases, but the PR is made
more complicated by the following two issues.

1. The MLIR attributes are readonly and we have a circular dependency here.
This has to be handled using the recursive interface provided by MLIR. This
requires us to first create a placeholder `DISubprogramAttr` which is used
in creating the `DIImportedEntityAttr`. Later another `DISubprogramAttr` is
created which replaces the placeholder.

2. The flang IR does not provide any information about the 'used' module, so
this has to be extracted by doing a pass over the `DeclareOp`s in the
function. This presents certain limitations, as 'only' clauses and module
variable renaming may not be handled properly.

Due to the change in `DISubprogramAttr`, some tests also needed to be
adjusted.

Fixes #98883.
---
 .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 145 ++++++++++++++----
 flang/test/Integration/debug-module-2.f90     |   2 +-
 flang/test/Transforms/debug-90683.fir         |   2 +-
 flang/test/Transforms/debug-fn-info.fir       |   6 +-
 .../test/Transforms/debug-imported-entity.fir |  30 ++++
 .../Transforms/debug-line-table-inc-file.fir  |   4 +-
 .../debug-local-global-storage-1.fir          |   2 +-
 7 files changed, 150 insertions(+), 41 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 576e65ba6ecc5..46e70d7ef9180 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -72,6 +72,10 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase<AddDebugInfoPass> {
                     mlir::LLVM::DICompileUnitAttr cuAttr,
                     fir::DebugTypeGenerator &typeGen,
                     mlir::SymbolTable *symbolTable);
+  std::optional<mlir::LLVM::DIModuleAttr>
+  getModuleAttrFromGlobalOp(fir::GlobalOp globalOp,
+                            mlir::LLVM::DIFileAttr fileAttr,
+                            mlir::LLVM::DIScopeAttr scope);
 };
 
 bool debugInfoIsAlreadySet(mlir::Location loc) {
@@ -152,6 +156,45 @@ mlir::LLVM::DIModuleAttr AddDebugInfoPass::getOrCreateModuleAttr(
   return modAttr;
 }
 
+/// If globalOp represents a module variable, return a ModuleAttr that
+/// represents that module.
+std::optional<mlir::LLVM::DIModuleAttr>
+AddDebugInfoPass::getModuleAttrFromGlobalOp(fir::GlobalOp globalOp,
+                                            mlir::LLVM::DIFileAttr fileAttr,
+                                            mlir::LLVM::DIScopeAttr scope) {
+  mlir::MLIRContext *context = &getContext();
+  mlir::OpBuilder builder(context);
+
+  std::pair<fir::NameUniquer::NameKind, fir::NameUniquer::DeconstructedName>
+      result = fir::NameUniquer::deconstruct(globalOp.getSymName());
+  // Only look for a module if this variable is not part of a function.
+  if (!result.second.procs.empty() || result.second.modules.empty())
+    return std::nullopt;
+
+  // DWARF5 says the following about Fortran modules:
+  // A Fortran 90 module may also be represented by a module entry
+  // (but no declaration attribute is warranted because Fortran has no concept
+  // of a corresponding module body).
+  // But in practice, compilers use the declaration attribute with a module in
+  // cases where the module was defined in another source file (and is only
+  // being used in this one). The isInitialized() flag seems to provide the
+  // right information, but inverted. It is true where the module is actually
+  // defined but false where it is used.
+  // FIXME: Currently we don't have the line number on which a module was
+  // declared. We are using a best guess of line - 1 where line is the source
+  // line of the first member of the module that we encounter.
+  unsigned line = getLineFromLoc(globalOp.getLoc());
+
+  mlir::LLVM::DISubprogramAttr sp =
+      mlir::dyn_cast_if_present<mlir::LLVM::DISubprogramAttr>(scope);
+  // Modules are generated at compile unit scope
+  if (sp)
+    scope = sp.getCompileUnit();
+
+  return getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope,
+                               std::max(line - 1, (unsigned)1),
+                               !globalOp.isInitialized());
+}
+
 void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp,
                                       mlir::LLVM::DIFileAttr fileAttr,
                                       mlir::LLVM::DIScopeAttr scope,
@@ -174,33 +217,11 @@ void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp,
     return;
 
   unsigned line = getLineFromLoc(globalOp.getLoc());
+  std::optional<mlir::LLVM::DIModuleAttr> modOpt =
+      getModuleAttrFromGlobalOp(globalOp, fileAttr, scope);
+  if (modOpt)
+    scope = *modOpt;
 
-  // DWARF5 says following about the fortran modules:
-  // A Fortran 90 module may also be represented by a module entry
-  // (but no declaration attribute is warranted because Fortran has no concept
-  // of a corresponding module body).
-  // But in practice, compilers use declaration attribute with a module in cases
-  // where module was defined in another source file (only being used in this
-  // one). The isInitialized() seems to provide the right information
-  // but inverted. It is true where module is actually defined but false where
-  // it is used.
-  // FIXME: Currently we don't have the line number on which a module was
-  // declared. We are using a best guess of line - 1 where line is the source
-  // line of the first member of the module that we encounter.
-
-  if (result.second.procs.empty()) {
-    // Only look for module if this variable is not part of a function.
-    if (result.second.modules.empty())
-      return;
-
-    // Modules are generated at compile unit scope
-    if (mlir::LLVM::DISubprogramAttr sp =
-            mlir::dyn_cast_if_present<mlir::LLVM::DISubprogramAttr>(scope))
-      scope = sp.getCompileUnit();
-
-    scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope,
-                                  line - 1, !globalOp.isInitialized());
-  }
   mlir::LLVM::DITypeAttr diType =
       typeGen.convertType(globalOp.getType(), fileAttr, scope, declOp);
   auto gvAttr = mlir::LLVM::DIGlobalVariableAttr::get(
@@ -262,7 +283,7 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp,
       mlir::LLVM::DIFileAttr::get(context, fileName, filePath);
 
   // Only definitions need a distinct identifier and a compilation unit.
-  mlir::DistinctAttr id;
+  mlir::DistinctAttr id, id2;
   mlir::LLVM::DIScopeAttr Scope = fileAttr;
   mlir::LLVM::DICompileUnitAttr compilationUnit;
   mlir::LLVM::DISubprogramFlags subprogramFlags =
@@ -270,7 +291,10 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp,
   if (isOptimized)
     subprogramFlags = mlir::LLVM::DISubprogramFlags::Optimized;
   if (!funcOp.isExternal()) {
+    // The placeholder and the final function have to have different IDs,
+    // otherwise the translation code will reject one of them.
    id = mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
+    id2 = mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
    compilationUnit = cuAttr;
    subprogramFlags =
        subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
@@ -299,14 +323,69 @@
                                         line - 1, false);
   }
 
-  auto spAttr = mlir::LLVM::DISubprogramAttr::get(
-      context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
-      line, line, subprogramFlags, subTypeAttr, /*retainedNodes=*/{});
-  funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
-
-  // Don't process variables if user asked for line tables only.
-  if (debugLevel == mlir::LLVM::DIEmissionKind::LineTablesOnly)
+  if (debugLevel == mlir::LLVM::DIEmissionKind::LineTablesOnly) {
+    auto spAttr = mlir::LLVM::DISubprogramAttr::get(
+        context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
+        line, line, subprogramFlags, subTypeAttr, /*retainedNodes=*/{});
+    funcOp->setLoc(builder.getFusedLoc({l}, spAttr));
     return;
+  }
+
+  mlir::DistinctAttr recId =
+      mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
+
+  // The debug attributes in MLIR are readonly once created. But in the case of
+  // imported entities, we have a circular dependency. The
+  // DIImportedEntityAttr requires scope information (DISubprogramAttr in this
+  // case) and DISubprogramAttr requires the list of imported entities. The
+  // MLIR provides a way where a DISubprogramAttr can be created with a certain
+  // recID and be used in places like DIImportedEntityAttr. After that another
+  // DISubprogramAttr can be created with the same recID but with the list of
+  // entities now available. The MLIR translation code takes care of updating
+  // the references. Note that references will be updated only in the things
+  // that are part of DISubprogramAttr (like DIImportedEntityAttr), so we have
+  // to create the final DISubprogramAttr before we process local variables.
+  // Look at DIRecursiveTypeAttrInterface for more details.
+
+  auto spAttr = mlir::LLVM::DISubprogramAttr::get(
+      context, recId, /*isRecSelf=*/true, id, compilationUnit, Scope, funcName,
+      fullName, funcFileAttr, line, line, subprogramFlags, subTypeAttr,
+      /*retainedNodes=*/{});
+
+  // There is no direct information in the IR for any 'use' statement in the
+  // function. We have to extract that information from the DeclareOp. We do
+  // a pass on the DeclareOp and generate ModuleAttr and corresponding
+  // DIImportedEntityAttr for that module.
+  // FIXME: As we are depending on the variables to see which module is being
+  // 'used' in the function, there are certain limitations.
+  // For things like 'use mod1, only: v1', the whole module will be brought
+  // into the namespace in the debug info. It is not a problem as such unless
+  // there is a clash of names.
+ // There is no information about module variable renaming + llvm::DenseSet importedModules; + funcOp.walk([&](fir::cg::XDeclareOp declOp) { + if (&funcOp.front() == declOp->getBlock()) + if (auto global = + symbolTable->lookup(declOp.getUniqName())) { + std::optional modOpt = + getModuleAttrFromGlobalOp(global, fileAttr, cuAttr); + if (modOpt) { + auto importedEntity = mlir::LLVM::DIImportedEntityAttr::get( + context, llvm::dwarf::DW_TAG_imported_module, spAttr, *modOpt, + fileAttr, /*line=*/1, /*name=*/nullptr, /*elements*/ {}); + importedModules.insert(importedEntity); + } + } + }); + llvm::SmallVector entities(importedModules.begin(), + importedModules.end()); + // We have the imported entities now. Generate the final DISubprogramAttr. + spAttr = mlir::LLVM::DISubprogramAttr::get( + context, recId, /*isRecSelf=*/false, id2, compilationUnit, Scope, + funcName, fullName, funcFileAttr, line, line, subprogramFlags, + subTypeAttr, entities); + funcOp->setLoc(builder.getFusedLoc({l}, spAttr)); funcOp.walk([&](fir::cg::XDeclareOp declOp) { // FIXME: We currently dont handle variables that are not in the entry diff --git a/flang/test/Integration/debug-module-2.f90 b/flang/test/Integration/debug-module-2.f90 index 60fccaa2a6c1f..f07416c3ef3cc 100644 --- a/flang/test/Integration/debug-module-2.f90 +++ b/flang/test/Integration/debug-module-2.f90 @@ -17,7 +17,7 @@ module helper integer gli contains -!CHECK-DAG: !DISubprogram(name: "test", linkageName: "_QMhelperPtest", scope: ![[MOD]], file: ![[FILE2]], line: [[@LINE+1]]{{.*}}unit: ![[CU]]) +!CHECK-DAG: !DISubprogram(name: "test", linkageName: "_QMhelperPtest", scope: ![[MOD]], file: ![[FILE2]], line: [[@LINE+1]]{{.*}}unit: ![[CU]]{{.*}}) subroutine test() glr = 12.34 gli = 67 diff --git a/flang/test/Transforms/debug-90683.fir b/flang/test/Transforms/debug-90683.fir index cc6929c10411f..a21332e3968a7 100644 --- a/flang/test/Transforms/debug-90683.fir +++ b/flang/test/Transforms/debug-90683.fir @@ -22,4 +22,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK-DAG: #[[TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[TY1:.*]] = #llvm.di_subroutine_type -// CHECK-DAG: #{{.*}} = #llvm.di_subprogram +// CHECK-DAG: #{{.*}} = #llvm.di_subprogram<{{.*}}name = "cabs", linkageName = "cabs"{{.*}}, type = #[[TY1]]> diff --git a/flang/test/Transforms/debug-fn-info.fir b/flang/test/Transforms/debug-fn-info.fir index f456e35d3dd70..5433e088a648d 100644 --- a/flang/test/Transforms/debug-fn-info.fir +++ b/flang/test/Transforms/debug-fn-info.fir @@ -69,7 +69,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK: #[[TY2:.*]] = #llvm.di_subroutine_type // Line numbers should match the number in corresponding loc entry. 
-// CHECK: #llvm.di_subprogram -// CHECK: #llvm.di_subprogram -// CHECK: #llvm.di_subprogram +// CHECK: #llvm.di_subprogram<{{.*}}name = "_QQmain", linkageName = "_QQmain", file = {{.*}}, line = 15, scopeLine = 15, subprogramFlags = Definition, type = #[[TY0]]> +// CHECK: #llvm.di_subprogram<{{.*}}name = "fn1", linkageName = "_QFPfn1", file = {{.*}}, line = 26, scopeLine = 26, subprogramFlags = Definition, type = #[[TY1]]> +// CHECK: #llvm.di_subprogram<{{.*}}name = "fn2", linkageName = "_QFPfn2", file = {{.*}}, line = 43, scopeLine = 43, subprogramFlags = Definition, type = #[[TY2]]> diff --git a/flang/test/Transforms/debug-imported-entity.fir b/flang/test/Transforms/debug-imported-entity.fir new file mode 100644 index 0000000000000..7be6531a703a8 --- /dev/null +++ b/flang/test/Transforms/debug-imported-entity.fir @@ -0,0 +1,30 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + + +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { + fir.global @_QMfooEv1 : i32 { + %0 = fir.zero_bits i32 + fir.has_value %0 : i32 + } + fir.global internal @_QFtestExyz : i32 { + %c12_i32 = arith.constant 12 : i32 + fir.has_value %c12_i32 : i32 + } loc(#loc4) + func.func @test() attributes {fir.bindc_name = "test"} { + %0 = fir.address_of(@_QMfooEv1) : !fir.ref + %1 = fircg.ext_declare %0 {uniq_name = "_QMfooEv1"} : (!fir.ref) -> !fir.ref loc(#loc1) + %4 = fir.address_of(@_QFtestExyz) : !fir.ref + %5 = fircg.ext_declare %4 {uniq_name = "_QFtestExyz"} : (!fir.ref) -> !fir.ref loc(#loc4) + return + } loc(#loc3) +} +#loc1 = loc("test.f90":2:14) +#loc2 = loc("test.f90":6:1) +#loc3 = loc("test.f90":10:1) +#loc4 = loc("test.f90":13:1) + +// CHECK: #[[MOD:.+]] = #llvm.di_module<{{.*}}name = "foo"{{.*}}> +// CHECK: #[[SP_REC:.+]] = #llvm.di_subprogram, isRecSelf = true{{.*}}> +// CHECK: #[[IMP_ENTITY:.+]] = #llvm.di_imported_entity +// CHECK: #[[SP:.+]] = #llvm.di_subprogram{{.*}}retainedNodes = #[[IMP_ENTITY]]> +// CHECK: #llvm.di_global_variable diff --git a/flang/test/Transforms/debug-line-table-inc-file.fir b/flang/test/Transforms/debug-line-table-inc-file.fir index 065039b59c5ae..216cd5e016f2f 100644 --- a/flang/test/Transforms/debug-line-table-inc-file.fir +++ b/flang/test/Transforms/debug-line-table-inc-file.fir @@ -31,7 +31,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK: #[[LOC_INC_FILE:.*]] = loc("{{.*}}inc.f90":1:1) // CHECK: #[[LOC_FILE:.*]] = loc("{{.*}}simple.f90":3:1) // CHECK: #[[DI_CU:.*]] = #llvm.di_compile_unit, sourceLanguage = DW_LANG_Fortran95, file = #[[DI_FILE]], producer = "{{.*}}flang{{.*}}", isOptimized = false, emissionKind = LineTablesOnly> -// CHECK: #[[DI_SP_INC:.*]] = #llvm.di_subprogram, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "sinc", linkageName = "_QPsinc", file = #[[DI_INC_FILE]], {{.*}}> -// CHECK: #[[DI_SP:.*]] = #llvm.di_subprogram, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "_QQmain", linkageName = "_QQmain", file = #[[DI_FILE]], {{.*}}> +// CHECK: #[[DI_SP_INC:.*]] = #llvm.di_subprogram<{{.*}}id = distinct[{{.*}}]<>, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "sinc", linkageName = "_QPsinc", file = #[[DI_INC_FILE]], {{.*}}> +// CHECK: #[[DI_SP:.*]] = #llvm.di_subprogram<{{.*}}id = distinct[{{.*}}]<>, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "_QQmain", linkageName = "_QQmain", file = #[[DI_FILE]], {{.*}}> // CHECK: #[[FUSED_LOC_INC_FILE]] = loc(fused<#[[DI_SP_INC]]>[#[[LOC_INC_FILE]]]) // CHECK: #[[FUSED_LOC_FILE]] = loc(fused<#[[DI_SP]]>[#[[LOC_FILE]]]) diff --git 
a/flang/test/Transforms/debug-local-global-storage-1.fir b/flang/test/Transforms/debug-local-global-storage-1.fir index d9d8083a14709..83a9055a6b8dc 100644 --- a/flang/test/Transforms/debug-local-global-storage-1.fir +++ b/flang/test/Transforms/debug-local-global-storage-1.fir @@ -45,7 +45,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : // CHECK-DAG: #[[CU:.*]] = #llvm.di_compile_unit<{{.*}}> // CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}scope = #[[CU]]{{.*}}name = "example"{{.*}}> // CHECK-DAG: #[[SP:.*]] = #llvm.di_subprogram<{{.*}}name = "_QQmain"{{.*}}> -// CHECK-DAG: #[[MOD_SP:.*]] = #llvm.di_subprogram<{{.*}}name = "mod_sub"{{.*}}> +// CHECK-DAG: #[[MOD_SP:.*]] = #llvm.di_subprogram<{{.*}}name = "mod_sub"{{.*}}retainedNodes = {{.*}}> // CHECK-DAG: #llvm.di_global_variable // CHECK-DAG: #llvm.di_global_variable // CHECK-DAG: #llvm.di_global_variable From 300161761df54f5f85630a8ad0e170d09d119ee3 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 11 Sep 2024 09:34:15 +0100 Subject: [PATCH 069/114] [AArch64] Add tests for scalar_to_vector(load) and extend load into zero tests. NFC --- .../test/CodeGen/AArch64/load-insert-undef.ll | 1098 +++++++++++++++++ llvm/test/CodeGen/AArch64/load-insert-zero.ll | 323 ++++- 2 files changed, 1420 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/load-insert-undef.ll diff --git a/llvm/test/CodeGen/AArch64/load-insert-undef.ll b/llvm/test/CodeGen/AArch64/load-insert-undef.ll new file mode 100644 index 0000000000000..1e776d1c06fcb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/load-insert-undef.ll @@ -0,0 +1,1098 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+bf16,+sve | FileCheck %s + +define <8 x i8> @loadv8i8(ptr %p) { +; CHECK-LABEL: loadv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: ret + %l = load i8, ptr %p + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8(ptr %p) { +; CHECK-LABEL: loadv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: ret + %l = load i8, ptr %p + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16(ptr %p) { +; CHECK-LABEL: loadv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load i16, ptr %p + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16(ptr %p) { +; CHECK-LABEL: loadv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load i16, ptr %p + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32(ptr %p) { +; CHECK-LABEL: loadv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load i32, ptr %p + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32(ptr %p) { +; CHECK-LABEL: loadv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load i32, ptr %p + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64(ptr %p) { +; CHECK-LABEL: loadv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %l = load i64, ptr %p + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + + +define <4 x half> @loadv4f16(ptr %p) { +; CHECK-LABEL: loadv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + 
%l = load half, ptr %p + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16(ptr %p) { +; CHECK-LABEL: loadv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load half, ptr %p + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16(ptr %p) { +; CHECK-LABEL: loadv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load bfloat, ptr %p + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16(ptr %p) { +; CHECK-LABEL: loadv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load bfloat, ptr %p + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32(ptr %p) { +; CHECK-LABEL: loadv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load float, ptr %p + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32(ptr %p) { +; CHECK-LABEL: loadv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load float, ptr %p + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64(ptr %p) { +; CHECK-LABEL: loadv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %l = load double, ptr %p + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +; Unscaled + +define <8 x i8> @loadv8i8_offset(ptr %p) { +; CHECK-LABEL: loadv8i8_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_offset(ptr %p) { +; CHECK-LABEL: loadv16i8_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_offset(ptr %p) { +; CHECK-LABEL: loadv4i16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_offset(ptr %p) { +; CHECK-LABEL: loadv8i16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_offset(ptr %p) { +; CHECK-LABEL: loadv2i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_offset(ptr %p) { +; CHECK-LABEL: loadv4i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_offset(ptr %p) { +; 
CHECK-LABEL: loadv2i64_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur x8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + + +define <4 x half> @loadv4f16_offset(ptr %p) { +; CHECK-LABEL: loadv4f16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_offset(ptr %p) { +; CHECK-LABEL: loadv8f16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load half, ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_offset(ptr %p) { +; CHECK-LABEL: loadv4bf16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_offset(ptr %p) { +; CHECK-LABEL: loadv8bf16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_offset(ptr %p) { +; CHECK-LABEL: loadv2f32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_offset(ptr %p) { +; CHECK-LABEL: loadv4f32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_offset(ptr %p) { +; CHECK-LABEL: loadv2f64_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +define <8 x i8> @loadv8i8_noffset(ptr %p) { +; CHECK-LABEL: loadv8i8_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurb w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_noffset(ptr %p) { +; CHECK-LABEL: loadv16i8_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurb w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_noffset(ptr %p) { +; CHECK-LABEL: loadv4i16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_noffset(ptr %p) { +; CHECK-LABEL: loadv8i16_noffset: +; CHECK: // 
%bb.0: +; CHECK-NEXT: ldurh w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_noffset(ptr %p) { +; CHECK-LABEL: loadv2i32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_noffset(ptr %p) { +; CHECK-LABEL: loadv4i32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_noffset(ptr %p) { +; CHECK-LABEL: loadv2i64_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur x8, [x0, #-1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + + +define <4 x half> @loadv4f16_noffset(ptr %p) { +; CHECK-LABEL: loadv4f16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_noffset(ptr %p) { +; CHECK-LABEL: loadv8f16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load half, ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_noffset(ptr %p) { +; CHECK-LABEL: loadv4bf16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_noffset(ptr %p) { +; CHECK-LABEL: loadv8bf16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_noffset(ptr %p) { +; CHECK-LABEL: loadv2f32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_noffset(ptr %p) { +; CHECK-LABEL: loadv4f32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_noffset(ptr %p) { +; CHECK-LABEL: loadv2f64_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +; ROW addressing modes + +define <8 x i8> @loadv8i8_roW(ptr %p, i32 %o) { +; CHECK-LABEL: 
loadv8i8_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, w1, sxtw] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i32 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv16i8_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, w1, sxtw] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i32 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4i16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i32 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8i16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i32 %o + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2i32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i32 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4i32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i32 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2i64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i32 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw 
#2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i32 %o + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + +; roX + +define <8 x i8> @loadv8i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv16i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i64 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, 
ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i64 %o + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +; SVE + +define <vscale x 8 x i8> @loadnxv8i8(ptr %p) { +; CHECK-LABEL: loadnxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %l = load i8, ptr %p + %v = insertelement <vscale x 8 x i8> poison, i8 %l, i32 0 + ret <vscale x 8 x i8> %v +} + +define <vscale x 16 x i8> @loadnxv16i8(ptr %p) { +; CHECK-LABEL: loadnxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %l = load i8, ptr %p + %v = insertelement <vscale x 16 x i8> poison, i8 %l, i32 0 + ret <vscale x 16 x i8> %v +} + +define <vscale x 4 x i16> @loadnxv4i16(ptr %p) { +; CHECK-LABEL: loadnxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %l = load i16, ptr %p + %v = insertelement <vscale x 4 x i16> poison, i16 %l, i32 0 + ret <vscale x 4 x i16> %v +} + +define <vscale x 8 x i16> @loadnxv8i16(ptr %p) { +; CHECK-LABEL: loadnxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %l = load i16, ptr %p + %v = insertelement <vscale x 8 x i16> poison, i16 %l, i32 0 + ret <vscale x 8 x i16> %v +} + +define <vscale x 2 x i32> @loadnxv2i32(ptr %p) { +; CHECK-LABEL: loadnxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %l = load i32, ptr %p + %v = insertelement <vscale x 2 x i32> poison, i32 %l, i32 0 + ret <vscale x 2 x i32> %v +} + +define <vscale x 4 x i32> @loadnxv4i32(ptr %p) { +; CHECK-LABEL: loadnxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %l = load i32, ptr %p + %v = insertelement <vscale x 4 x i32> poison, i32 %l, i32 0 + ret <vscale x 4 x i32> %v +} + +define <vscale x 2 x i64> @loadnxv2i64(ptr %p) { +; CHECK-LABEL: loadnxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %l = load i64, ptr %p + %v = insertelement <vscale x 2 x i64> poison, i64 %l, i32 0 + ret <vscale x 2 x i64> %v +} + + +define <vscale x 4 x half> @loadnxv4f16(ptr %p) { +; CHECK-LABEL: loadnxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load half, ptr %p + %v = 
insertelement <vscale x 4 x half> poison, half %l, i32 0 + ret <vscale x 4 x half> %v +} + +define <vscale x 8 x half> @loadnxv8f16(ptr %p) { +; CHECK-LABEL: loadnxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load half, ptr %p + %v = insertelement <vscale x 8 x half> poison, half %l, i32 0 + ret <vscale x 8 x half> %v +} + +define <vscale x 4 x bfloat> @loadnxv4bf16(ptr %p) { +; CHECK-LABEL: loadnxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load bfloat, ptr %p + %v = insertelement <vscale x 4 x bfloat> poison, bfloat %l, i32 0 + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 8 x bfloat> @loadnxv8bf16(ptr %p) { +; CHECK-LABEL: loadnxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load bfloat, ptr %p + %v = insertelement <vscale x 8 x bfloat> poison, bfloat %l, i32 0 + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 2 x float> @loadnxv2f32(ptr %p) { +; CHECK-LABEL: loadnxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load float, ptr %p + %v = insertelement <vscale x 2 x float> poison, float %l, i32 0 + ret <vscale x 2 x float> %v +} + +define <vscale x 4 x float> @loadnxv4f32(ptr %p) { +; CHECK-LABEL: loadnxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load float, ptr %p + %v = insertelement <vscale x 4 x float> poison, float %l, i32 0 + ret <vscale x 4 x float> %v +} + +define <vscale x 2 x double> @loadnxv2f64(ptr %p) { +; CHECK-LABEL: loadnxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %l = load double, ptr %p + %v = insertelement <vscale x 2 x double> poison, double %l, i32 0 + ret <vscale x 2 x double> %v +} + + +; Unscaled + +define <vscale x 8 x i8> @loadnxv8i8_offset(ptr %p) { +; CHECK-LABEL: loadnxv8i8_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i8, ptr %g + %v = insertelement <vscale x 8 x i8> poison, i8 %l, i32 0 + ret <vscale x 8 x i8> %v +} + +define <vscale x 16 x i8> @loadnxv16i8_offset(ptr %p) { +; CHECK-LABEL: loadnxv16i8_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i8, ptr %g + %v = insertelement <vscale x 16 x i8> poison, i8 %l, i32 0 + ret <vscale x 16 x i8> %v +} + +define <vscale x 4 x i16> @loadnxv4i16_offset(ptr %p) { +; CHECK-LABEL: loadnxv4i16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i16, ptr %g + %v = insertelement <vscale x 4 x i16> poison, i16 %l, i32 0 + ret <vscale x 4 x i16> %v +} + +define <vscale x 8 x i16> @loadnxv8i16_offset(ptr %p) { +; CHECK-LABEL: loadnxv8i16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i16, ptr %g + %v = insertelement <vscale x 8 x i16> poison, i16 %l, i32 0 + ret <vscale x 8 x i16> %v +} + +define <vscale x 2 x i32> @loadnxv2i32_offset(ptr %p) { +; CHECK-LABEL: loadnxv2i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i32, ptr %g + %v = insertelement <vscale x 2 x i32> poison, i32 %l, i32 0 + ret <vscale x 2 x i32> %v +} + +define <vscale x 4 x i32> @loadnxv4i32_offset(ptr %p) { +; CHECK-LABEL: loadnxv4i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i32, ptr %g + %v = insertelement <vscale x 4 x i32> poison, i32 %l, i32 0 + ret <vscale x 4 x i32> %v +} + +define <vscale x 2 x i64> @loadnxv2i64_offset(ptr %p) { +; CHECK-LABEL: loadnxv2i64_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur x8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i64, ptr %g + %v = insertelement <vscale x 2 x i64> poison, i64 %l, i32 0 + ret <vscale x 2 x i64> %v +} + + +define <vscale x 4 x half> @loadnxv4f16_offset(ptr %p) { +; CHECK-LABEL: loadnxv4f16_offset: +; CHECK: // %bb.0: 
+; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load half, ptr %g + %v = insertelement <vscale x 4 x half> poison, half %l, i32 0 + ret <vscale x 4 x half> %v +} + +define <vscale x 8 x half> @loadnxv8f16_offset(ptr %p) { +; CHECK-LABEL: loadnxv8f16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load half, ptr %g + %v = insertelement <vscale x 8 x half> poison, half %l, i32 0 + ret <vscale x 8 x half> %v +} + +define <vscale x 4 x bfloat> @loadnxv4bf16_offset(ptr %p) { +; CHECK-LABEL: loadnxv4bf16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load bfloat, ptr %g + %v = insertelement <vscale x 4 x bfloat> poison, bfloat %l, i32 0 + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 8 x bfloat> @loadnxv8bf16_offset(ptr %p) { +; CHECK-LABEL: loadnxv8bf16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load bfloat, ptr %g + %v = insertelement <vscale x 8 x bfloat> poison, bfloat %l, i32 0 + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 2 x float> @loadnxv2f32_offset(ptr %p) { +; CHECK-LABEL: loadnxv2f32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load float, ptr %g + %v = insertelement <vscale x 2 x float> poison, float %l, i32 0 + ret <vscale x 2 x float> %v +} + +define <vscale x 4 x float> @loadnxv4f32_offset(ptr %p) { +; CHECK-LABEL: loadnxv4f32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load float, ptr %g + %v = insertelement <vscale x 4 x float> poison, float %l, i32 0 + ret <vscale x 4 x float> %v +} + +define <vscale x 2 x double> @loadnxv2f64_offset(ptr %p) { +; CHECK-LABEL: loadnxv2f64_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load double, ptr %g + %v = insertelement <vscale x 2 x double> poison, double %l, i32 0 + ret <vscale x 2 x double> %v +} diff --git a/llvm/test/CodeGen/AArch64/load-insert-zero.ll b/llvm/test/CodeGen/AArch64/load-insert-zero.ll index 23d545459295f..ccbd6f03fbcc3 100644 --- a/llvm/test/CodeGen/AArch64/load-insert-zero.ll +++ b/llvm/test/CodeGen/AArch64/load-insert-zero.ll @@ -378,7 +378,6 @@ define <2 x i64> @loadv2i64_noffset(ptr %p) { ret <2 x i64> %v } - define <4 x half> @loadv4f16_noffset(ptr %p) { ; CHECK-LABEL: loadv4f16_noffset: ; CHECK: // %bb.0: @@ -457,6 +456,328 @@ define <2 x double> @loadv2f64_noffset(ptr %p) { } + +; ROW addressing modes + +define <8 x i8> @loadv8i8_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8i8_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, w1, sxtw] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i32 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv16i8_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, w1, sxtw] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i32 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4i16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i32 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8i16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr 
%p, i32 %o + %l = load i16, ptr %g + %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2i32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, sxtw #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i32 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4i32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, sxtw #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i32 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2i64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i32 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <4 x half> zeroinitializer, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <8 x half> zeroinitializer, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, sxtw #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <2 x float> zeroinitializer, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, sxtw #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <4 x float> zeroinitializer, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i32 %o + %l = load double, ptr %g + %v = insertelement <2 x double> zeroinitializer, double %l, i32 0 + ret <2 x double> %v +} + + +; roX + 
+define <8 x i8> @loadv8i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv16i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i64 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, ptr %g + %v = insertelement <4 x half> zeroinitializer, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, ptr %g + %v = insertelement <8 x half> zeroinitializer, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> 
zeroinitializer, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <2 x float> zeroinitializer, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <4 x float> zeroinitializer, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i64 %o + %l = load double, ptr %g + %v = insertelement <2 x double> zeroinitializer, double %l, i32 0 + ret <2 x double> %v +} + + define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) { ; CHECK-LABEL: predictor_4x4_neon: ; CHECK: // %bb.0: From b4bb2f8aef01aeab8bf4fd164ed14a2c083d2858 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 11 Sep 2024 16:46:12 +0800 Subject: [PATCH 070/114] [LoopDeletion] Unblock loop deletion with `llvm.experimental.noalias.scope.decl` (#108144) Since `llvm.experimental.noalias.scope.decl` is marked as `memory(inaccessiblemem: readwrite)`, we cannot treat this annotation intrinsic as having no side effects. It will block loop deletion when this intrinsic exists inside a dead loop: https://github.com/llvm/llvm-project/blob/3dad29b677e427bf69c035605a16efd065576829/llvm/lib/Transforms/Scalar/LoopDeletion.cpp#L103-L110 This patch marks `llvm.experimental.noalias.scope.decl` as droppable to address the issue. Fixes https://github.com/llvm/llvm-project/issues/108052. 
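For context, a minimal sketch of the check this refers to (paraphrased, not the verbatim LoopDeletion.cpp source; `mayHaveSideEffects()` and `isDroppable()` are the actual Instruction APIs):

  // A loop only counts as "dead" if no instruction in it has observable
  // effects. Droppable instructions (now including noalias.scope.decl)
  // are exempt from this test.
  for (BasicBlock *BB : L->blocks())
    for (Instruction &I : *BB)
      if (I.mayHaveSideEffects() && !I.isDroppable())
        return false; // keeps the otherwise-dead loop alive

With the intrinsic marked droppable, the annotation no longer trips this check and the loop can be deleted as expected.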
--- llvm/lib/IR/User.cpp | 12 ++++++++- llvm/test/Transforms/LoopDeletion/noalias.ll | 28 ++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LoopDeletion/noalias.ll diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp index 637af7aaa2453..00dd9c72c469c 100644 --- a/llvm/lib/IR/User.cpp +++ b/llvm/lib/IR/User.cpp @@ -113,7 +113,17 @@ MutableArrayRef<uint8_t> User::getDescriptor() { } bool User::isDroppable() const { - return isa<AssumeInst>(this) || isa<PseudoProbeInst>(this); + if (auto *II = dyn_cast<IntrinsicInst>(this)) { + switch (II->getIntrinsicID()) { + default: + return false; + case Intrinsic::assume: + case Intrinsic::pseudoprobe: + case Intrinsic::experimental_noalias_scope_decl: + return true; + } + } + return false; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/Transforms/LoopDeletion/noalias.ll b/llvm/test/Transforms/LoopDeletion/noalias.ll new file mode 100644 index 0000000000000..0f3b71df94270 --- /dev/null +++ b/llvm/test/Transforms/LoopDeletion/noalias.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=loop-deletion -S | FileCheck %s + +define void @pr108052(i64 %n) { +; CHECK-LABEL: define void @pr108052( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK: [[FOR_EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.exit: + ret void + +for.body: + %indvar = phi i64 [ 0, %entry ], [ %inc, %for.body ] + call void @llvm.experimental.noalias.scope.decl(metadata !0) + %inc = add nuw i64 %indvar, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.exit, label %for.body +} + +!0 = !{!1} +!1 = distinct !{!1, !2, !"x: %a"} +!2 = distinct !{!2, !"x"} From 596e7ccd30655faae9c4f35a1913dc23d08f3857 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Wed, 11 Sep 2024 10:04:56 +0100 Subject: [PATCH 071/114] [RISCV][doc] Add note to RISCVUsage about supported atomics ABIs (#103879) I've tried to avoid giving too much detailed explanation as the psABI docs are the better source for this. --- llvm/docs/RISCVUsage.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 8846b82fcaea5..a15af9adfa945 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -281,6 +281,13 @@ Supported ``Za128rs``, ``Za64rs``, ``Zama16b``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare`` These extensions are defined as part of the `RISC-V Profiles specification `__. They do not introduce any new features themselves, but instead describe existing hardware features. +Atomics ABIs +============ + +At the time of writing there are three atomics mappings (ABIs) `defined for RISC-V `__. As of LLVM 19, LLVM defaults to "A6S", which is compatible with both the original "A6" and the future "A7" ABI. See `the psABI atomics document `__ for more information on these mappings. + +Note that although the "A6S" mapping is used, the ELF attribute recording the mapping isn't currently emitted by default due to a bug causing a crash in older versions of binutils when processing files containing this attribute. 
+ Experimental Extensions ======================= From a8f3d303122d049e65b699870615d464b77b489f Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 11 Sep 2024 17:08:44 +0800 Subject: [PATCH 072/114] [mlir] Add dependent TensorDialect to ConvertVectorToLLVM pass (#108045) This patch registers the tensor dialect as a dependency of the ConvertVectorToLLVM pass, which fixes a crash when `vector.transfer_write` is used with a dynamic tensor type. The MaterializeTransferMask pattern would call `vector::createOrFoldDimOp`, which creates a `tensor.dim` operation. Fixes #107805. --- .../VectorToLLVM/ConvertVectorToLLVMPass.cpp | 2 ++ mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp index 842d239cf6a51..4623b9667998c 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp @@ -19,6 +19,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Dialect/X86Vector/Transforms.h" @@ -45,6 +46,7 @@ struct ConvertVectorToLLVMPass registry.insert<LLVM::LLVMDialect>(); registry.insert<arith::ArithDialect>(); registry.insert<memref::MemRefDialect>(); + registry.insert<tensor::TensorDialect>(); if (armNeon) registry.insert<arm_neon::ArmNeonDialect>(); if (armSVE) diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index 7ac49c5f02347..bd14823cea50a 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -2521,6 +2521,16 @@ func.func @transfer_write_1d_scalable_mask(%arg0: memref<1x?xf32>, %vec: vector< // ----- +// CHECK-LABEL: func @transfer_write_tensor +// CHECK: vector.transfer_write +func.func @transfer_write_tensor(%arg0: vector<4xf32>,%arg1: tensor<?xf32>) -> tensor<?xf32> { + %c0 = arith.constant 0 : index + %0 = vector.transfer_write %arg0, %arg1[%c0] : vector<4xf32>, tensor<?xf32> + return %0 : tensor<?xf32> +} + +// ----- + func.func @genbool_0d_f() -> vector<i1> { %0 = vector.constant_mask [0] : vector<i1> return %0 : vector<i1> From a4b0153c4f5f0d6bcf42fb2cb97dbdfad9c59e2c Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 11 Sep 2024 17:10:58 +0800 Subject: [PATCH 073/114] [mlir][vector] Support for extracting 1-element vectors in VectorExtractOpConversion (#107549) This patch adds support for converting `vector.extract` ops that extract 1-element vectors to LLVM, fixing a crash in such cases. E.g., `vector.extract %1[0]: vector<1xf32> from vector<2xf32>`. Fixes #61372. --- .../VectorToLLVM/ConvertVectorToLLVM.cpp | 5 +++- .../VectorToLLVM/vector-to-llvm.mlir | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 53e18a2e9d299..687061e9988f8 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1104,7 +1104,10 @@ class VectorExtractOpConversion } // One-shot extraction of vector from array (only requires extractvalue). - if (isa<VectorType>(resultType)) {
+ // Except for extracting 1-element vectors. 
+ if (isa<VectorType>(resultType) && + position.size() != + static_cast<size_t>(extractOp.getSourceVectorType().getRank())) { if (extractOp.hasDynamicPosition()) return failure(); diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index bd14823cea50a..2fe9ba8fead17 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -1130,6 +1130,30 @@ func.func @extract_scalar_from_vec_1d_f32_scalable(%arg0: vector<[16]xf32>) -> f // ----- +func.func @extract_vec_1e_from_vec_1d_f32(%arg0: vector<16xf32>) -> vector<1xf32> { + %0 = vector.extract %arg0[15]: vector<1xf32> from vector<16xf32> + return %0 : vector<1xf32> +} +// CHECK-LABEL: @extract_vec_1e_from_vec_1d_f32( +// CHECK-SAME: %[[A:.*]]: vector<16xf32>) +// CHECK: %[[T0:.*]] = llvm.mlir.constant(15 : i64) : i64 +// CHECK: %[[T1:.*]] = llvm.extractelement %[[A]][%[[T0]] : i64] : vector<16xf32> +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : f32 to vector<1xf32> +// CHECK: return %[[T2]] : vector<1xf32> + +func.func @extract_vec_1e_from_vec_1d_f32_scalable(%arg0: vector<[16]xf32>) -> vector<1xf32> { + %0 = vector.extract %arg0[15]: vector<1xf32> from vector<[16]xf32> + return %0 : vector<1xf32> +} +// CHECK-LABEL: @extract_vec_1e_from_vec_1d_f32_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[16]xf32>) +// CHECK: %[[T0:.*]] = llvm.mlir.constant(15 : i64) : i64 +// CHECK: %[[T1:.*]] = llvm.extractelement %[[A]][%[[T0]] : i64] : vector<[16]xf32> +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : f32 to vector<1xf32> +// CHECK: return %[[T2]] : vector<1xf32> + +// ----- + func.func @extract_scalar_from_vec_1d_index(%arg0: vector<16xindex>) -> index { %0 = vector.extract %arg0[15]: index from vector<16xindex> return %0 : index From f4dd1bc8fc625d3938f95b9d06aaaeebd2e90dca Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 11 Sep 2024 10:23:41 +0100 Subject: [PATCH 074/114] [AMDGPU] Fix leak and self-assignment in copy assignment operator (#107847) A static analyzer identified that this operator was unsafe in the case of self-assignment. In the placement new statement, StringValue's copy constructor was being implicitly called, which received a reference to "itself". In fact, it was being passed an old StringValue at the same address - one whose lifetime had already ended. The copy constructor was thus copying fields from a dead object. We need to be careful when switching active union members, and calling the destructor on the old StringValue will avoid memory leaks which I believe the old code exhibited. 
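To make the idiom concrete, here is a standalone sketch of the same pattern (a hypothetical `Tagged` type for illustration only, not the LLVM code):

  #include <new>
  #include <string>

  struct Tagged {
    bool IsStr;
    union {
      std::string Str; // non-trivial member: lifetime managed by hand
      int Num;
    };
    Tagged() : IsStr(false), Num(0) {}
    Tagged(const Tagged &O) : IsStr(O.IsStr) {
      if (IsStr)
        new (&Str) std::string(O.Str);
      else
        Num = O.Num;
    }
    Tagged &operator=(const Tagged &O) {
      if (IsStr != O.IsStr) {
        // Switching active members: end the old lifetime or begin the new one
        // before any plain assignment takes place.
        if (O.IsStr)
          new (&Str) std::string();
        else
          Str.~basic_string();
      }
      IsStr = O.IsStr;
      if (IsStr)
        Str = O.Str; // ordinary assignment; safe even when &O == this
      else
        Num = O.Num;
      return *this;
    }
    ~Tagged() {
      if (IsStr)
        Str.~basic_string();
    }
  };

Self-assignment can never change the active member, so the switching branch is skipped and the subsequent member assignment falls into the usual self-assignment-safe case.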
--- .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 7af5e7388f841..4cc60f5097899 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -100,19 +100,25 @@ struct SIArgument { SIArgument() : IsRegister(false), StackOffset(0) {} SIArgument(const SIArgument &Other) { IsRegister = Other.IsRegister; - if (IsRegister) { - ::new ((void *)std::addressof(RegisterName)) - StringValue(Other.RegisterName); - } else + if (IsRegister) + new (&RegisterName) StringValue(Other.RegisterName); + else StackOffset = Other.StackOffset; Mask = Other.Mask; } SIArgument &operator=(const SIArgument &Other) { + // Default-construct or destruct the old RegisterName in case of switching + // union members + if (IsRegister != Other.IsRegister) { + if (Other.IsRegister) + new (&RegisterName) StringValue(); + else + RegisterName.~StringValue(); + } IsRegister = Other.IsRegister; - if (IsRegister) { - ::new ((void *)std::addressof(RegisterName)) - StringValue(Other.RegisterName); - } else + if (IsRegister) + RegisterName = Other.RegisterName; + else StackOffset = Other.StackOffset; Mask = Other.Mask; return *this; From 2e4e918bf03868bb4cd0d0415766cfba8dc1d899 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 11 Sep 2024 11:27:30 +0200 Subject: [PATCH 075/114] [bazel] Port bc152fbf43157659f8b6817e8510e1fbe6e175b5 --- .../llvm-project-overlay/llvm/BUILD.bazel | 25 ++++++++++++++++--- .../llvm-project-overlay/llvm/driver.bzl | 1 + 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 4df7954ea3440..2af3fb40507ed 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -3690,22 +3690,41 @@ cc_binary( ], ) -cc_binary( - name = "llvm-debuginfod-find", +gentbl( + name = "DebugInfodFindOptsTableGen", + strip_include_prefix = "tools/llvm-debuginfod-find", + tbl_outs = [( + "-gen-opt-parser-defs", + "tools/llvm-debuginfod-find/Opts.inc", + )], + tblgen = ":llvm-tblgen", + td_file = "tools/llvm-debuginfod-find/Opts.td", + td_srcs = ["include/llvm/Option/OptParser.td"], +) + +cc_library( + name = "llvm-debuginfod-find-lib", srcs = glob([ "tools/llvm-debuginfod-find/*.cpp", ]), copts = llvm_copts, - stamp = 0, deps = [ ":BitReader", ":Core", + ":DebugInfodFindOptsTableGen", ":Debuginfod", + ":Option", ":Support", ":Symbolize", ], ) +llvm_driver_cc_binary( + name = "llvm-debuginfod-find", + stamp = 0, + deps = [":llvm-debuginfod-find-lib"], +) + cc_binary( name = "llvm-dis", srcs = glob([ diff --git a/utils/bazel/llvm-project-overlay/llvm/driver.bzl b/utils/bazel/llvm-project-overlay/llvm/driver.bzl index b3d3b2eed9f06..66e8af7db7d0e 100644 --- a/utils/bazel/llvm-project-overlay/llvm/driver.bzl +++ b/utils/bazel/llvm-project-overlay/llvm/driver.bzl @@ -16,6 +16,7 @@ _TOOLS = { "llvm-ar": "//llvm:llvm-ar-lib", "llvm-cgdata": "//llvm:llvm-cgdata-lib", "llvm-cxxfilt": "//llvm:llvm-cxxfilt-lib", + "llvm-debuginfod-find": "//llvm:llvm-debuginfod-find-lib", "llvm-dwp": "//llvm:llvm-dwp-lib", "llvm-gsymutil": "//llvm:llvm-gsymutil-lib", "llvm-ifs": "//llvm:llvm-ifs-lib", From c4a00be08aa10b5e51ee5db426d61ac87d9dc6fd Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 11 Sep 2024 11:36:59 +0200 Subject: 
[PATCH 076/114] [bazel] Add missing dependency for a8f3d303122d049e65b699870615d464b77b489f --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 5ee0ee5108276..b43bdb7b5f471 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -11780,6 +11780,7 @@ cc_library( ":MaskableOpInterface", ":MemRefDialect", ":Pass", + ":TensorDialect", ":ToLLVMIRTranslation", ":TransformUtils", ":VectorDialect", From 935b9f6274b39b35f6b391aaf4c87c0605421fb3 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 11 Sep 2024 10:39:34 +0100 Subject: [PATCH 077/114] [AMDGPU] Make use of multiclass inheritance. NFC. --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 16 ++++++---------- llvm/lib/Target/AMDGPU/BUFInstructions.td | 5 ++--- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 5 ++--- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 5 ++--- 4 files changed, 12 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 2085113992ad1..e20c26eb83787 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1026,18 +1026,14 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = { } } - multiclass AMDGPUImageDimAtomic<string opmod, list<LLVMType> rettype = [llvm_anyint_ty]> { - defm "" - : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">], rettype>; - } + multiclass AMDGPUImageDimAtomic<string opmod, list<LLVMType> rettype = [llvm_anyint_ty]> : + AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">], rettype>; - multiclass AMDGPUImageDimFloatAtomic<string opmod> { - defm "" : AMDGPUImageDimAtomic<opmod, [llvm_anyfloat_ty]>; - } + multiclass AMDGPUImageDimFloatAtomic<string opmod> : + AMDGPUImageDimAtomic<opmod, [llvm_anyfloat_ty]>; - multiclass AMDGPUImageDimAnyAtomic<string opmod> { - defm "" : AMDGPUImageDimAtomic<opmod, [llvm_any_ty]>; - } + multiclass AMDGPUImageDimAnyAtomic<string opmod> : + AMDGPUImageDimAtomic<opmod, [llvm_any_ty]>; defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAnyAtomic<"ATOMIC_SWAP">; defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index c6668b24f4ef6..532ece8b16c5e 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1583,9 +1583,8 @@ multiclass BufferAtomicPat; } -multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> { - defm : BufferAtomicPat<OpPrefix, vt, Inst, /* isIntr */ 1>; -} +multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> : + BufferAtomicPat<OpPrefix, vt, Inst, /* isIntr */ 1>; multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> { foreach RtnMode = ["ret", "noret"] in { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 03e4cb9fcf49b..8e5b61e8e492e 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -882,9 +882,8 @@ multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName, } } -multiclass VOP1_Realtriple_e64<GFXGen Gen, bits<9> op> { - defm NAME : VOP3_Realtriple<Gen, {0, 1, 1, op{6-0}}>; -} +multiclass VOP1_Realtriple_e64<GFXGen Gen, bits<9> op> : + VOP3_Realtriple<Gen, {0, 1, 1, op{6-0}}>; multiclass VOP1_Realtriple_e64_with_name<GFXGen Gen, bits<9> op, string opName, string asmName> { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index fccaa27f36138..afae7a886288c 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1513,9 +1513,8 @@ multiclass VOP2be_Real_dpp8<GFXGen Gen, bits<6> op, string opName, string asmNam } // We don't want to override separate decoderNamespaces within these -multiclass VOP2_Realtriple_e64<GFXGen Gen, bits<6> op> { - defm NAME : VOP3_Realtriple <Gen, {0, 1, 0, 0, op{5-0}}>; -} +multiclass VOP2_Realtriple_e64<GFXGen Gen, bits<6> op> : + VOP3_Realtriple<Gen, {0, 1, 0, 0, op{5-0}}>; 
multiclass VOP2_Realtriple_e64_with_name op, string opName, string asmName> { From 704116373ae91a1b829dc3d3d269874fb27b579c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 11 Sep 2024 11:06:55 +0100 Subject: [PATCH 078/114] [AMDGPU] Regenerate buffer intrinsic tests with update_llc_test_checks. NFC. --- ...m.amdgcn.raw.ptr.buffer.load.format.d16.ll | 76 +++-- ...mdgcn.struct.ptr.buffer.load.format.d16.ll | 100 +++++-- .../llvm.amdgcn.struct.ptr.buffer.load.ll | 282 +++++++++++------- ...dgcn.struct.ptr.buffer.store.format.d16.ll | 152 ++++++---- ...m.amdgcn.struct.ptr.buffer.store.format.ll | 102 ++++--- .../llvm.amdgcn.struct.ptr.buffer.store.ll | 209 ++++++++----- 6 files changed, 611 insertions(+), 310 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll index c27118446cc2f..cafd903df2d56 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll @@ -1,47 +1,79 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=UNPACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s -; GCN-LABEL: {{^}}buffer_load_format_d16_x: -; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_x: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_x v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_x: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_x v0, off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call half @llvm.amdgcn.raw.ptr.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret half %data } -; GCN-LABEL: {{^}}buffer_load_format_d16_xy: -; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] define amdgpu_ps half @buffer_load_format_d16_xy(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xy: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_xy v[0:1], off, s[0:3], 0 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt 
vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xy: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_xy v0, off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: -; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 define amdgpu_ps half @buffer_load_format_d16_xyz(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyz: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_xyz v[0:2], off, s[0:3], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v2 ; encoding: [0x02,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyz: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_xyz v[0:1], off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_mov_b32_e32 v0, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <3 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: -; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyzw(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyzw: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_xyzw v[0:3], off, s[0:3], 0 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyzw: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <4 x half> %data, i32 3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll index 3a396b54f89ab..39df6ec679e88 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll @@ -1,57 +1,107 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga 
-verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=UNPACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s -; GCN-LABEL: {{^}}buffer_load_format_d16_x: -; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_x: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x20,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_x: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call half @llvm.amdgcn.struct.ptr.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret half %data } -; GCN-LABEL: {{^}}buffer_load_format_d16_xy: -; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] define amdgpu_ps half @buffer_load_format_d16_xy(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xy: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_xy v[0:1], v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x24,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xy: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_xy v0, v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: -; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: 
buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyz(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyz: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_xyz v[0:2], v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x28,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v2 ; encoding: [0x02,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyz: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_xyz v[0:1], v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_mov_b32_e32 v0, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <3 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: -; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyzw(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyzw: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_xyzw v[0:3], v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x2c,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyzw: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_xyzw v[0:1], v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <4 x half> %data, i32 3 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_i16_x: -; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_ps half @buffer_load_format_i16_x(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_i16_x: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x20,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_i16_x: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call i16 
@llvm.amdgcn.struct.ptr.buffer.load.format.i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %fdata = bitcast i16 %data to half diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll index 2f9e6b0a1cf52..55600cab8432b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll @@ -1,12 +1,16 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,SI ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,VI -;CHECK-LABEL: {{^}}buffer_load: -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_load_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v8, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v8, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dwordx4 v[4:7], v8, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_dwordx4 v[8:11], v8, s[0:3], 0 idxen slc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -17,106 +21,165 @@ main_body: ret {<4 x float>, <4 x float>, <4 x float>} %r2 } -;CHECK-LABEL: {{^}}buffer_load_immoffs: -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 40, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_immoffs_large: -;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load_immoffs_large: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_movk_i32 s4, 0x1ffc +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], s4 idxen offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 4, i32 8188, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_idx: -;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) { +; CHECK-LABEL: buffer_load_idx: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: 
buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_ofs: -;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { +; CHECK-LABEL: buffer_load_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_ofs_imm: -;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { +; CHECK-LABEL: buffer_load_ofs_imm: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_both: -;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) { +; CHECK-LABEL: buffer_load_both: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_both_reversed: -;CHECK: v_mov_b32_e32 v2, v0 -;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, i32, i32) { +; CHECK-LABEL: buffer_load_both_reversed: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_x1: -;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps float @buffer_load_x1(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: buffer_load_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) ret float %data } -;CHECK-LABEL: {{^}}buffer_load_x2: -;CHECK: 
buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <2 x float> @buffer_load_x2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: buffer_load_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) ret <2 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_negative_offset: -;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, -16, v0 -;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) { +; SI-LABEL: buffer_load_negative_offset: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: buffer_load_negative_offset: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ; return to shader part epilog main_body: %ofs.1 = add i32 %ofs, -16 %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs.1, i32 0, i32 0) ret <4 x float> %data } -; SI won't merge ds memory operations, because of the signed offset bug, so -; we only have check lines for VI. -; CHECK-LABEL: buffer_load_mmo: -; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 +; SI won't merge ds memory operations, because of the signed offset bug. 
define amdgpu_ps float @buffer_load_mmo(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %lds) { +; SI-LABEL: buffer_load_mmo: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: buffer_load_dword v1, v2, s[0:3], 0 idxen +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_write_b32 v0, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; SI-NEXT: ds_write_b32 v0, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: buffer_load_mmo: +; VI: ; %bb.0: ; %entry +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: buffer_load_dword v1, v2, s[0:3], 0 idxen +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ; return to shader part epilog entry: store float 0.0, ptr addrspace(3) %lds %val = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) @@ -125,12 +188,15 @@ entry: ret float %val } -;CHECK-LABEL: {{^}}buffer_load_int: -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_load_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load_int: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v6, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dwordx2 v[4:5], v6, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_dword v6, v6, s[0:3], 0 idxen slc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x i32> @llvm.amdgcn.struct.ptr.buffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <2 x i32> @llvm.amdgcn.struct.ptr.buffer.load.v2i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -144,13 +210,13 @@ main_body: ret {<4 x float>, <2 x float>, float} %r2 } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_ubyte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_ubyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_ubyte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i8 @llvm.amdgcn.struct.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = zext i8 %tmp to i32 @@ -158,13 +224,13 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_ushort: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_ushort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_ushort: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = zext i16 %tmp to i32 @@ -172,13 +238,13 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_sbyte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_sbyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_sbyte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_sbyte v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i8 @llvm.amdgcn.struct.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = sext i8 %tmp to i32 @@ -186,13 +252,13 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_sshort: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_sshort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_sshort: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_sshort v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = sext i16 %tmp to i32 @@ -200,72 +266,84 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b16 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call half @llvm.amdgcn.struct.ptr.buffer.load.f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store half %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v2f16: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b32 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v2f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <2 x half> %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v4f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b64 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v4f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[1:2] +; CHECK-NEXT: s_endpgm main_body: %val = call <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <4 x half> %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b16 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store i16 %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v2i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b32 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v2i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call <2 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v2i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <2 x i16> %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v4i16: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b64 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v4i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[1:2] +; CHECK-NEXT: s_endpgm main_body: %val = call <4 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <4 x i16> %val, ptr addrspace(3) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll index 8109fca4a043a..58b422dd6a751 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll @@ -1,85 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s -; GCN-LABEL: {{^}}buffer_store_format_d16_x: -; GCN: s_load_dword s[[LO:[0-9]+]] -; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] -; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_x(ptr addrspace(8) %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { +; GCN-LABEL: buffer_store_format_d16_x: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_load_dword s4, s[6:7], 0x30 +; GCN-NEXT: s_load_dword s5, s[6:7], 0x54 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_format_d16_x v0, v1, s[0:3], 0 idxen +; GCN-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.f16(half %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -; GCN-LABEL: {{^}}buffer_store_format_d16_xy: - -; UNPACKED: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}} -; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] -; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen - -; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %index) { +; UNPACKED-LABEL: buffer_store_format_d16_xy: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; UNPACKED-NEXT: s_waitcnt lgkmcnt(0) +; UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 +; UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; 
UNPACKED-NEXT: v_mov_b32_e32 v1, s6
+; UNPACKED-NEXT: v_mov_b32_e32 v2, s5
+; UNPACKED-NEXT: buffer_store_format_d16_xy v[0:1], v2, s[0:3], 0 idxen
+; UNPACKED-NEXT: s_endpgm
+;
+; PACKED-LABEL: buffer_store_format_d16_xy:
+; PACKED: ; %bb.0: ; %main_body
+; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
+; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: v_mov_b32_e32 v0, s4
+; PACKED-NEXT: v_mov_b32_e32 v1, s5
+; PACKED-NEXT: buffer_store_format_d16_xy v0, v1, s[0:3], 0 idxen
+; PACKED-NEXT: s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.ptr.buffer.store.format.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
 
-; GCN-LABEL: {{^}}buffer_store_format_d16_xyz:
-; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-
-; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-
-; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
-
-; UNPACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-
-; PACKED: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
-; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]]
-
-; PACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
 define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) {
+; UNPACKED-LABEL: buffer_store_format_d16_xyz:
+; UNPACKED: ; %bb.0: ; %main_body
+; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
+; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18
+; UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
+; UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
+; UNPACKED-NEXT: s_lshr_b32 s7, s4, 16
+; UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
+; UNPACKED-NEXT: v_mov_b32_e32 v0, s4
+; UNPACKED-NEXT: v_mov_b32_e32 v1, s7
+; UNPACKED-NEXT: v_mov_b32_e32 v2, s5
+; UNPACKED-NEXT: v_mov_b32_e32 v3, s6
+; UNPACKED-NEXT: buffer_store_format_d16_xyz v[0:2], v3, s[0:3], 0 idxen
+; UNPACKED-NEXT: s_endpgm
+;
+; PACKED-LABEL: buffer_store_format_d16_xyz:
+; PACKED: ; %bb.0: ; %main_body
+; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
+; PACKED-NEXT: s_load_dword s8, s[6:7], 0x18
+; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: s_and_b32 s5, s5, 0xffff
+; PACKED-NEXT: v_mov_b32_e32 v0, s4
+; PACKED-NEXT: v_mov_b32_e32 v1, s5
+; PACKED-NEXT: v_mov_b32_e32 v2, s8
+; PACKED-NEXT: buffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 idxen
+; PACKED-NEXT: s_endpgm
 main_body:
   %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
   call void @llvm.amdgcn.struct.ptr.buffer.store.format.v3f16(<3 x half> %data_subvec, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
 
-; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
-; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-
-; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
-; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-
-; 
UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] -; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] - -; UNPACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen - -; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] -; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] - -; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) { +; UNPACKED-LABEL: buffer_store_format_d16_xyzw: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; UNPACKED-NEXT: s_waitcnt lgkmcnt(0) +; UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 +; UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff +; UNPACKED-NEXT: s_lshr_b32 s8, s4, 16 +; UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; UNPACKED-NEXT: v_mov_b32_e32 v1, s8 +; UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; UNPACKED-NEXT: v_mov_b32_e32 v3, s7 +; UNPACKED-NEXT: v_mov_b32_e32 v4, s6 +; UNPACKED-NEXT: buffer_store_format_d16_xyzw v[0:3], v4, s[0:3], 0 idxen +; UNPACKED-NEXT: s_endpgm +; +; PACKED-LABEL: buffer_store_format_d16_xyzw: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PACKED-NEXT: v_mov_b32_e32 v2, s8 +; PACKED-NEXT: buffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 idxen +; PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -; GCN-LABEL: {{^}}buffer_store_format_i16_x: -; GCN: s_load_dword s[[LO:[0-9]+]] -; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] -; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_i16_x(ptr addrspace(8) %rsrc, [8 x i32], i16 %data, [8 x i32], i32 %index) { +; GCN-LABEL: buffer_store_format_i16_x: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_load_dword s4, s[6:7], 0x30 +; GCN-NEXT: s_load_dword s5, s[6:7], 0x54 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_format_d16_x v0, v1, s[0:3], 0 idxen +; GCN-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.i16(i16 %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll index 13217b24dcd4b..61a08d96986b0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll @@ -1,12 +1,15 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck 
-check-prefixes=CHECK,SI %s +;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VI %s -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { +; CHECK-LABEL: buffer_store: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v12, 0 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v12, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_format_xyzw v[4:7], v12, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_store_format_xyzw v[8:11], v12, s[0:3], 0 idxen slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -14,47 +17,56 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { +; CHECK-LABEL: buffer_store_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen offset:42 +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_idx: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_idx: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, v4 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both_reversed: -;CHECK: v_mov_b32_e32 v6, v4 -;CHECK-NOT: s_waitcnt -;CHECK: 
buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both_reversed: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i32 0, i32 0) ret void @@ -62,14 +74,23 @@ main_body: ; Ideally, the register allocator would avoid the wait here ; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { +; SI-LABEL: buffer_store_wait: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: buffer_store_wait: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +; VI-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen +; VI-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0, i32 0) @@ -77,28 +98,31 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) { +; CHECK-LABEL: buffer_store_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_x v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_i32: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1_i32(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %index) { +; CHECK-LABEL: buffer_store_x1_i32: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_x v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) { +; CHECK-LABEL: buffer_store_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll index e52af31360764..d08623f685e85 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll @@ -1,12 +1,15 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=CHECK,SI %s +;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VI %s + define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { +; CHECK-LABEL: buffer_store: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v12, 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v12, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_dwordx4 v[4:7], v12, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_store_dwordx4 v[8:11], v12, s[0:3], 0 idxen slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -14,62 +17,79 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { +; CHECK-LABEL: buffer_store_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen offset:42 +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_idx: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_idx: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, v4 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both: -;CHECK-NOT: 
s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both_reversed: -;CHECK: v_mov_b32_e32 v6, v4 -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both_reversed: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i32 0, i32 0) ret void } ; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { +; SI-LABEL: buffer_store_wait: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: buffer_store_wait: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; VI-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; VI-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0, i32 0) @@ -77,30 +97,34 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) { +; CHECK-LABEL: buffer_store_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) #0 { +; CHECK-LABEL: buffer_store_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_int: -;CHECK-NOT: s_waitcnt -;CHECK: 
buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc define amdgpu_ps void @buffer_store_int(ptr addrspace(8) inreg, <4 x i32>, <2 x i32>, i32) { +; CHECK-LABEL: buffer_store_int: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v7, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], v7, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_store_dword v6, v7, s[0:3], 0 idxen slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4i32(<4 x i32> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.store.v2i32(<2 x i32> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -108,12 +132,12 @@ main_body: ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_byte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_ptr_buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { +; CHECK-LABEL: struct_ptr_buffer_store_byte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_byte v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -121,39 +145,63 @@ main_body: ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_ptr_buffer_store_f16(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { +; CHECK-LABEL: struct_ptr_buffer_store_f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm %v2 = fptrunc float %v1 to half call void @llvm.amdgcn.struct.ptr.buffer.store.f16(half %v2, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_v2f16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_v2f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_v2f16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v2f16(<2 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_v4f16: -;CHECK-NEXT: %bb. 
-;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_v4f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_v4f16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v4f16(<4 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_ptr_buffer_store_i16(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { +; CHECK-LABEL: struct_ptr_buffer_store_i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -161,18 +209,39 @@ main_body: ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_vif16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_vif16(ptr addrspace(8) inreg %rsrc, <2 x i16> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_vif16: +; SI: ; %bb.0: +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_vif16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v2i16(<2 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_v4i16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_v4i16: +; SI: ; %bb.0: +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_v4i16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v4i16(<4 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } From 7e0008d5ad5ea7df0b9586f07e5af4a7225dac96 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 11 Sep 2024 12:22:36 +0200 Subject: [PATCH 079/114] [LLD][COFF][NFC] Create import thunks in ImportFile::parse. 
 (#107929)

---
 lld/COFF/InputFiles.cpp  | 19 +++++++++++++++++--
 lld/COFF/InputFiles.h    |  2 ++
 lld/COFF/SymbolTable.cpp |  4 ++--
 lld/COFF/SymbolTable.h   |  3 ++-
 lld/COFF/Symbols.cpp     | 18 +++---------------
 lld/COFF/Symbols.h       |  2 +-
 6 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index fa2d230075d9d..c7956baf73cf4 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -1009,6 +1009,20 @@ MachineTypes ImportFile::getMachineType() const {
   return MachineTypes(machine);
 }
 
+ImportThunkChunk *ImportFile::makeImportThunk() {
+  switch (hdr->Machine) {
+  case AMD64:
+    return make<ImportThunkChunkX64>(ctx, impSym);
+  case I386:
+    return make<ImportThunkChunkX86>(ctx, impSym);
+  case ARM64:
+    return make<ImportThunkChunkARM64>(ctx, impSym);
+  case ARMNT:
+    return make<ImportThunkChunkARM>(ctx, impSym);
+  }
+  llvm_unreachable("unknown machine type");
+}
+
 void ImportFile::parse() {
   const auto *hdr =
       reinterpret_cast<const coff_import_header *>(mb.getBufferStart());
@@ -1069,9 +1083,10 @@ void ImportFile::parse() {
   // DLL functions just like regular non-DLL functions.)
   if (hdr->getType() == llvm::COFF::IMPORT_CODE) {
     if (ctx.config.machine != ARM64EC) {
-      thunkSym = ctx.symtab.addImportThunk(name, impSym, hdr->Machine);
+      thunkSym = ctx.symtab.addImportThunk(name, impSym, makeImportThunk());
    } else {
-      thunkSym = ctx.symtab.addImportThunk(name, impSym, AMD64);
+      thunkSym = ctx.symtab.addImportThunk(
+          name, impSym, make<ImportThunkChunkX64>(ctx, impSym));
       // FIXME: Add aux IAT symbols.
     }
   }
diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h
index 8b3303a8d87f4..1d55b4f34f754 100644
--- a/lld/COFF/InputFiles.h
+++ b/lld/COFF/InputFiles.h
@@ -55,6 +55,7 @@ class Defined;
 class DefinedImportData;
 class DefinedImportThunk;
 class DefinedRegular;
+class ImportThunkChunk;
 class SectionChunk;
 class Symbol;
 class Undefined;
@@ -352,6 +353,7 @@ class ImportFile : public InputFile {
 
 private:
   void parse() override;
+  ImportThunkChunk *makeImportThunk();
 
 public:
   StringRef externalName;
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index bb7583bb9a7df..c9b3d78e3de17 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -784,11 +784,11 @@ DefinedImportData *SymbolTable::addImportData(StringRef n, ImportFile *f) {
 }
 
 Symbol *SymbolTable::addImportThunk(StringRef name, DefinedImportData *id,
-                                    uint16_t machine) {
+                                    ImportThunkChunk *chunk) {
   auto [s, wasInserted] = insert(name, nullptr);
   s->isUsedInRegularObj = true;
   if (wasInserted || isa<Undefined>(s) || s->isLazy()) {
-    replaceSymbol<DefinedImportThunk>(s, ctx, name, id, machine);
+    replaceSymbol<DefinedImportThunk>(s, ctx, name, id, chunk);
     return s;
   }
 
diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h
index 51c6c79ec1446..3a277fc700e86 100644
--- a/lld/COFF/SymbolTable.h
+++ b/lld/COFF/SymbolTable.h
@@ -28,6 +28,7 @@ class COFFLinkerContext;
 class Defined;
 class DefinedAbsolute;
 class DefinedRegular;
+class ImportThunkChunk;
 class LazyArchive;
 class SectionChunk;
 class Symbol;
@@ -104,7 +105,7 @@ class SymbolTable {
                                 CommonChunk *c = nullptr);
   DefinedImportData *addImportData(StringRef n, ImportFile *f);
   Symbol *addImportThunk(StringRef name, DefinedImportData *s,
-                         uint16_t machine);
+                         ImportThunkChunk *chunk);
   void addLibcall(StringRef name);
   void addEntryThunk(Symbol *from, Symbol *to);
   void initializeEntryThunks();
diff --git a/lld/COFF/Symbols.cpp b/lld/COFF/Symbols.cpp
index b098abb80d6f1..5f4d797f74a2d 100644
--- a/lld/COFF/Symbols.cpp
+++ b/lld/COFF/Symbols.cpp
@@ -107,22 +107,10 @@ COFFSymbolRef DefinedCOFF::getCOFFSymbol() {
 
 uint64_t DefinedAbsolute::getRVA() { return va - ctx.config.imageBase; }
 
-static Chunk
*makeImportThunk(COFFLinkerContext &ctx, DefinedImportData *s,
-                         uint16_t machine) {
-  if (machine == AMD64)
-    return make<ImportThunkChunkX64>(ctx, s);
-  if (machine == I386)
-    return make<ImportThunkChunkX86>(ctx, s);
-  if (machine == ARM64)
-    return make<ImportThunkChunkARM64>(ctx, s);
-  assert(machine == ARMNT);
-  return make<ImportThunkChunkARM>(ctx, s);
-}
-
 DefinedImportThunk::DefinedImportThunk(COFFLinkerContext &ctx, StringRef name,
-                                       DefinedImportData *s, uint16_t machine)
-    : Defined(DefinedImportThunkKind, name), wrappedSym(s),
-      data(makeImportThunk(ctx, s, machine)) {}
+                                       DefinedImportData *s,
+                                       ImportThunkChunk *chunk)
+    : Defined(DefinedImportThunkKind, name), wrappedSym(s), data(chunk) {}
 
 Defined *Undefined::getWeakAlias() {
   // A weak alias may be a weak alias to another symbol, so check recursively.
diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h
index c427a062dc82b..724330e4bab95 100644
--- a/lld/COFF/Symbols.h
+++ b/lld/COFF/Symbols.h
@@ -388,7 +388,7 @@ class DefinedImportData : public Defined {
 class DefinedImportThunk : public Defined {
 public:
   DefinedImportThunk(COFFLinkerContext &ctx, StringRef name,
-                     DefinedImportData *s, uint16_t machine);
+                     DefinedImportData *s, ImportThunkChunk *chunk);
 
   static bool classof(const Symbol *s) {
     return s->kind() == DefinedImportThunkKind;

From e1ee07d0ff7a37bf5f52d560a52925c0507471e1 Mon Sep 17 00:00:00 2001
From: Akshat Oke <76596238+Akshat-Oke@users.noreply.github.com>
Date: Wed, 11 Sep 2024 16:00:16 +0530
Subject: [PATCH 080/114] [AMDGPU][NewPM] Port SIPeepholeSDWA pass to NPM
 (#107049)

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  6 +-
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |  1 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  5 +-
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp     | 56 +++++++++++++------
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h       | 24 ++++++++
 llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir        |  3 +
 llvm/test/CodeGen/AMDGPU/sdwa-ops.mir         |  2 +
 llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir    |  2 +
 8 files changed, 77 insertions(+), 22 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 8d6e022e1e4d4..399aa9c633564 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -36,7 +36,7 @@ void initializeAMDGPURegBankSelectPass(PassRegistry &);
 FunctionPass *createGCNDPPCombinePass();
 FunctionPass *createSIAnnotateControlFlowLegacyPass();
 FunctionPass *createSIFoldOperandsLegacyPass();
-FunctionPass *createSIPeepholeSDWAPass();
+FunctionPass *createSIPeepholeSDWALegacyPass();
 FunctionPass *createSILowerI1CopiesLegacyPass();
 FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass();
 FunctionPass *createSIShrinkInstructionsLegacyPass();
@@ -163,8 +163,8 @@ extern char &GCNDPPCombineLegacyID;
 void initializeSIFoldOperandsLegacyPass(PassRegistry &);
 extern char &SIFoldOperandsLegacyID;
 
-void initializeSIPeepholeSDWAPass(PassRegistry &);
-extern char &SIPeepholeSDWAID;
+void initializeSIPeepholeSDWALegacyPass(PassRegistry &);
+extern char &SIPeepholeSDWALegacyID;
 
 void initializeSIShrinkInstructionsLegacyPass(PassRegistry &);
 extern char &SIShrinkInstructionsLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 58481fe9df239..97661bf9837f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -100,5 +100,6 @@ MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
 MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) +MACHINE_FUNCTION_PASS("si-peephole-sdwa", SIPeepholeSDWAPass()) MACHINE_FUNCTION_PASS("si-shrink-instructions", SIShrinkInstructionsPass()) #undef MACHINE_FUNCTION_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9c9c505139373..55d0de59bc49a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -39,6 +39,7 @@ #include "SILoadStoreOptimizer.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" +#include "SIPeepholeSDWA.h" #include "SIShrinkInstructions.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -415,7 +416,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIFixSGPRCopiesLegacyPass(*PR); initializeSIFixVGPRCopiesPass(*PR); initializeSIFoldOperandsLegacyPass(*PR); - initializeSIPeepholeSDWAPass(*PR); + initializeSIPeepholeSDWALegacyPass(*PR); initializeSIShrinkInstructionsLegacyPass(*PR); initializeSIOptimizeExecMaskingPreRAPass(*PR); initializeSIOptimizeVGPRLiveRangePass(*PR); @@ -1275,7 +1276,7 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&GCNDPPCombineLegacyID); addPass(&SILoadStoreOptimizerLegacyID); if (isPassEnabled(EnableSDWAPeephole)) { - addPass(&SIPeepholeSDWAID); + addPass(&SIPeepholeSDWALegacyID); addPass(&EarlyMachineLICMID); addPass(&MachineCSELegacyID); addPass(&SIFoldOperandsLegacyID); diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index d80e1277b2a8a..86cb0e6944ed7 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -19,6 +19,7 @@ /// //===----------------------------------------------------------------------===// +#include "SIPeepholeSDWA.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -45,7 +46,7 @@ class SDWADstOperand; using SDWAOperandsVector = SmallVector; using SDWAOperandsMap = MapVector; -class SIPeepholeSDWA : public MachineFunctionPass { +class SIPeepholeSDWA { private: MachineRegisterInfo *MRI; const SIRegisterInfo *TRI; @@ -57,14 +58,6 @@ class SIPeepholeSDWA : public MachineFunctionPass { std::optional foldToImm(const MachineOperand &Op) const; -public: - static char ID; - - SIPeepholeSDWA() : MachineFunctionPass(ID) { - initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr matchSDWAOperand(MachineInstr &MI); void pseudoOpConvertToVOP2(MachineInstr &MI, @@ -72,8 +65,20 @@ class SIPeepholeSDWA : public MachineFunctionPass { bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; +public: + bool run(MachineFunction &MF); +}; + +class SIPeepholeSDWALegacy : public MachineFunctionPass { +public: + static char ID; + + SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {} + StringRef getPassName() const override { return "SI Peephole SDWA"; } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); @@ -192,17 +197,17 @@ class SDWADstPreserveOperand : public SDWADstOperand { } // end anonymous 
namespace

-INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
+INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
+                false)

-char SIPeepholeSDWA::ID = 0;
+char SIPeepholeSDWALegacy::ID = 0;

-char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
+char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;

-FunctionPass *llvm::createSIPeepholeSDWAPass() {
-  return new SIPeepholeSDWA();
+FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
+  return new SIPeepholeSDWALegacy();
 }
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
   switch(Sel) {
@@ -1235,10 +1240,17 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
   }
 }
 
-bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
+bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  return SIPeepholeSDWA().run(MF);
+}
+
+bool SIPeepholeSDWA::run(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
-  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
+  if (!ST.hasSDWA())
     return false;
 
   MRI = &MF.getRegInfo();
@@ -1295,3 +1307,13 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
 
   return Ret;
 }
+
+PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
+                                          MachineFunctionAnalysisManager &) {
+  if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h
new file mode 100644
index 0000000000000..217867220f7d8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h
@@ -0,0 +1,24 @@
+//===--------- SIPeepholeSDWA.h -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIPEEPHOLESDWA_H
+#define LLVM_LIB_TARGET_AMDGPU_SIPEEPHOLESDWA_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class SIPeepholeSDWAPass : public PassInfoMixin<SIPeepholeSDWAPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIPEEPHOLESDWA_H
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir b/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
index 4ff43024ae8cc..0c9f628dfbb2c 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
@@ -1,6 +1,9 @@
 # RUN: llc -mtriple=amdgcn -mcpu=kaveri -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=kaveri -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
 
 # GCN-LABEL: {{^}}name: add_shr_i32
 # GCN: [[SMOV:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 123
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir b/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
index ef986f8c9d2a3..0ad1b5527c854 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
@@ -1,5 +1,7 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s
 
 # test for 3 consecutive _sdwa's
 # GFX9-LABEL: name: test1_add_co_sdwa
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index 4ca39ecc7a0ae..ffbd2d092b5d8 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDWA %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDWA %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=SDWA %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=SDWA %s
 ---
 name: add_f16_u32_preserve
 tracksRegLiveness: true

From da6944912baffa430468078c38f65d55fb83dd43 Mon Sep 17 00:00:00 2001
From: Michael Buch
Date: Wed, 11 Sep 2024 11:36:27 +0100
Subject: [PATCH 081/114] [lldb][test] Add test for no_unique_address when mixed with bitfields (#108155)

This is the root cause for the LLDB
failures that started occurring after
https://github.com/llvm/llvm-project/pull/105865.

The DWARFASTParserClang has logic to try to derive unnamed bitfields
from DWARF offsets. In this case, we treat `padding` as a 1-byte-sized
field that would overlap with `flag`, and decide we need to introduce
an unnamed bitfield into the AST, which is incorrect.
---
 .../no_unique_address-with-bitfields.cpp      | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp

diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp
new file mode 100644
index 0000000000000..1c9cc36a711b4
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp
@@ -0,0 +1,28 @@
+// LLDB currently erroneously adds an unnamed bitfield
+// into the AST when an overlapping no_unique_address
+// field precedes a bitfield.
+
+// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s
+// RUN: %lldb %t \
+// RUN:   -o "target var global" \
+// RUN:   -o "image dump ast" \
+// RUN:   -o exit | FileCheck %s
+
+// CHECK: (lldb) image dump ast
+// CHECK: CXXRecordDecl {{.*}} struct Foo definition
+// CHECK: |-FieldDecl {{.*}} data 'char[5]'
+// CHECK-NEXT: |-FieldDecl {{.*}} padding 'Empty'
+// CHECK-NEXT: |-FieldDecl {{.*}} 'int'
+// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 8
+// CHECK-NEXT: `-FieldDecl {{.*}} <invalid sloc> flag 'unsigned long'
+// CHECK-NEXT:   `-IntegerLiteral {{.*}} 'int' 1
+
+struct Empty {};
+
+struct Foo {
+  char data[5];
+  [[no_unique_address]] Empty padding;
+  unsigned long flag : 1;
+};
+
+Foo global;

From 2f3d061918ece414d6db544a34b2e44a9950bc23 Mon Sep 17 00:00:00 2001
From: Sergio Afonso
Date: Wed, 11 Sep 2024 12:16:34 +0100
Subject: [PATCH 082/114] [MLIR][OpenMP] Automate operand structure definition (#99508)

This patch adds the "gen-openmp-clause-ops" `mlir-tblgen` generator to
produce the structure definitions previously in OpenMPClauseOperands.h
automatically from the information contained in OpenMPOps.td and
OpenMPClauses.td.

The original header is maintained to enable the definition of similar
structures that are not directly related to any single `OpenMP_Clause`
or `OpenMP_Op` tablegen definition.
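To illustrate the mapping the generator implements, a clause defined roughly
like this (a condensed sketch in the spirit of the openmp-clause-ops.td test
added below; `OpenMP_MyClause` and its arguments are made-up names):

```tablegen
def OpenMP_MyClause : OpenMP_Clause<
    /*isRequired=*/false, /*skipTraits=*/false, /*skipArguments=*/false,
    /*skipAssemblyFormat=*/false, /*skipDescription=*/false,
    /*skipExtraClassDeclaration=*/false> {
  let arguments = (ins
    I32:$chunk_size,      // value operand
    UnitAttr:$nowait_flag // attribute operand
  );
}
```

now gets its operand structure generated automatically, with values
represented as `mlir::Value` and attributes by their `storageType`:

```cpp
struct MyClauseOps {
  ::mlir::Value chunkSize;
  ::mlir::UnitAttr nowaitFlag;
};
```

Operation-level structures are aggregated the same way, e.g.
`def OpenMP_MyOp : OpenMP_Op<"my_op", clauses=[OpenMP_MyClause]>;` yields
`using MyOperands = detail::Clauses<MyClauseOps>;`.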
--- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 6 +- flang/lib/Lower/OpenMP/ClauseProcessor.h | 2 +- .../mlir/Dialect/OpenMP/CMakeLists.txt | 1 + .../Dialect/OpenMP/OpenMPClauseOperands.h | 292 +----------------- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 34 +- mlir/test/mlir-tblgen/openmp-clause-ops.td | 86 ++++++ mlir/tools/mlir-tblgen/OmpOpGen.cpp | 215 ++++++++++++- 7 files changed, 319 insertions(+), 317 deletions(-) create mode 100644 mlir/test/mlir-tblgen/openmp-clause-ops.td diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 3f54234b176e3..f336d213cc862 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -181,7 +181,7 @@ static void addUseDeviceClause( static void convertLoopBounds(lower::AbstractConverter &converter, mlir::Location loc, - mlir::omp::LoopRelatedOps &result, + mlir::omp::LoopRelatedClauseOps &result, std::size_t loopVarTypeSize) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); // The types of lower bound, upper bound, and step are converted into the @@ -203,7 +203,7 @@ static void convertLoopBounds(lower::AbstractConverter &converter, bool ClauseProcessor::processCollapse( mlir::Location currentLocation, lower::pft::Evaluation &eval, - mlir::omp::LoopRelatedOps &result, + mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv) const { bool found = false; fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); @@ -855,7 +855,7 @@ bool ClauseProcessor::processIf( // Assume that, at most, a single 'if' clause will be applicable to the // given directive. if (operand) { - result.ifVar = operand; + result.ifExpr = operand; found = true; } }); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index f6b319c726a2d..8d02d368f4ee0 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -55,7 +55,7 @@ class ClauseProcessor { // 'Unique' clauses: They can appear at most once in the clause list. 
bool processCollapse(mlir::Location currentLocation, lower::pft::Evaluation &eval, - mlir::omp::LoopRelatedOps &result, + mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv) const; bool processDevice(lower::StatementContext &stmtCtx, mlir::omp::DeviceClauseOps &result) const; diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt index dd349d1392e7b..a65c6b1d3c96b 100644 --- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt @@ -17,6 +17,7 @@ mlir_tablegen(OpenMPOpsDialect.h.inc -gen-dialect-decls -dialect=omp) mlir_tablegen(OpenMPOpsDialect.cpp.inc -gen-dialect-defs -dialect=omp) mlir_tablegen(OpenMPOps.h.inc -gen-op-decls) mlir_tablegen(OpenMPOps.cpp.inc -gen-op-defs) +mlir_tablegen(OpenMPClauseOps.h.inc -gen-openmp-clause-ops) mlir_tablegen(OpenMPOpsTypes.h.inc -gen-typedef-decls -typedefs-dialect=omp) mlir_tablegen(OpenMPOpsTypes.cpp.inc -gen-typedef-defs -typedefs-dialect=omp) mlir_tablegen(OpenMPOpsEnums.h.inc -gen-enum-decls) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h index 38e4d8f245e4f..1247a871f93c6 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h @@ -23,303 +23,31 @@ #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.h.inc" +#include "mlir/Dialect/OpenMP/OpenMPClauseOps.h.inc" + namespace mlir { namespace omp { //===----------------------------------------------------------------------===// -// Mixin structures defining MLIR operands associated with each OpenMP clause. +// Extra clause operand structures. //===----------------------------------------------------------------------===// -struct AlignedClauseOps { - llvm::SmallVector alignedVars; - llvm::SmallVector alignments; -}; - -struct AllocateClauseOps { - llvm::SmallVector allocateVars, allocatorVars; -}; - -struct CancelDirectiveNameClauseOps { - ClauseCancellationConstructTypeAttr cancelDirective; -}; - -struct CopyprivateClauseOps { - llvm::SmallVector copyprivateVars; - llvm::SmallVector copyprivateSyms; -}; - -struct CriticalNameClauseOps { - /// This field has a generic name because it's mirroring the `sym_name` - /// argument of the `OpenMP_CriticalNameClause` tablegen definition. That one - /// can't be renamed to anything more specific because the `sym_name` name is - /// a requirement of the `Symbol` MLIR trait associated with that clause. - StringAttr symName; -}; - -struct DependClauseOps { - llvm::SmallVector dependKinds; - llvm::SmallVector dependVars; -}; - -struct DeviceClauseOps { - Value device; -}; - struct DeviceTypeClauseOps { - // The default capture type. + /// The default capture type. 
DeclareTargetDeviceType deviceType = DeclareTargetDeviceType::any; }; -struct DistScheduleClauseOps { - UnitAttr distScheduleStatic; - Value distScheduleChunkSize; -}; - -struct DoacrossClauseOps { - ClauseDependAttr doacrossDependType; - IntegerAttr doacrossNumLoops; - llvm::SmallVector doacrossDependVars; -}; - -struct FilterClauseOps { - Value filteredThreadId; -}; - -struct FinalClauseOps { - Value final; -}; - -struct GrainsizeClauseOps { - Value grainsize; -}; - -struct HasDeviceAddrClauseOps { - llvm::SmallVector hasDeviceAddrVars; -}; - -struct HintClauseOps { - IntegerAttr hint; -}; - -struct IfClauseOps { - Value ifVar; -}; - -struct InReductionClauseOps { - llvm::SmallVector inReductionVars; - llvm::SmallVector inReductionByref; - llvm::SmallVector inReductionSyms; -}; - -struct IsDevicePtrClauseOps { - llvm::SmallVector isDevicePtrVars; -}; - -struct LinearClauseOps { - llvm::SmallVector linearVars, linearStepVars; -}; - -struct LoopRelatedOps { - llvm::SmallVector loopLowerBounds, loopUpperBounds, loopSteps; - UnitAttr loopInclusive; -}; - -struct MapClauseOps { - llvm::SmallVector mapVars; -}; - -struct MergeableClauseOps { - UnitAttr mergeable; -}; - -struct NogroupClauseOps { - UnitAttr nogroup; -}; - -struct NontemporalClauseOps { - llvm::SmallVector nontemporalVars; -}; - -struct NowaitClauseOps { - UnitAttr nowait; -}; - -struct NumTasksClauseOps { - Value numTasks; -}; - -struct NumTeamsClauseOps { - Value numTeamsLower, numTeamsUpper; -}; - -struct NumThreadsClauseOps { - Value numThreads; -}; - -struct OrderClauseOps { - ClauseOrderKindAttr order; - OrderModifierAttr orderMod; -}; - -struct OrderedClauseOps { - IntegerAttr ordered; -}; - -struct ParallelizationLevelClauseOps { - UnitAttr parLevelSimd; -}; - -struct PriorityClauseOps { - Value priority; -}; - -struct PrivateClauseOps { - // SSA values that correspond to "original" values being privatized. - // They refer to the SSA value outside the OpenMP region from which a clone is - // created inside the region. - llvm::SmallVector privateVars; - // The list of symbols referring to delayed privatizer ops (i.e. `omp.private` - // ops). - llvm::SmallVector privateSyms; -}; - -struct ProcBindClauseOps { - ClauseProcBindKindAttr procBindKind; -}; - -struct ReductionClauseOps { - llvm::SmallVector reductionVars; - llvm::SmallVector reductionByref; - llvm::SmallVector reductionSyms; -}; - -struct SafelenClauseOps { - IntegerAttr safelen; -}; - -struct ScheduleClauseOps { - ClauseScheduleKindAttr scheduleKind; - Value scheduleChunk; - ScheduleModifierAttr scheduleMod; - UnitAttr scheduleSimd; -}; - -struct SimdlenClauseOps { - IntegerAttr simdlen; -}; - -struct TaskReductionClauseOps { - llvm::SmallVector taskReductionVars; - llvm::SmallVector taskReductionByref; - llvm::SmallVector taskReductionSyms; -}; - -struct ThreadLimitClauseOps { - Value threadLimit; -}; - -struct UntiedClauseOps { - UnitAttr untied; -}; - -struct UseDeviceAddrClauseOps { - llvm::SmallVector useDeviceAddrVars; -}; - -struct UseDevicePtrClauseOps { - llvm::SmallVector useDevicePtrVars; -}; - //===----------------------------------------------------------------------===// -// Structures defining clause operands associated with each OpenMP leaf -// construct. -// -// These mirror the arguments expected by the corresponding OpenMP MLIR ops. +// Extra operation operand structures. //===----------------------------------------------------------------------===// -namespace detail { -template -struct Clauses : public Mixins... 
{}; -} // namespace detail - -using CancelOperands = - detail::Clauses; - -using CancellationPointOperands = detail::Clauses; - -using CriticalDeclareOperands = - detail::Clauses; - -// TODO `indirect` clause. +// TODO: Add `indirect` clause. using DeclareTargetOperands = detail::Clauses; -using DistributeOperands = - detail::Clauses; - -using LoopNestOperands = detail::Clauses; - -using MaskedOperands = detail::Clauses; - -using OrderedOperands = detail::Clauses; - -using OrderedRegionOperands = detail::Clauses; - -using ParallelOperands = - detail::Clauses; - -using SectionsOperands = detail::Clauses; - -using SimdOperands = - detail::Clauses; - -using SingleOperands = detail::Clauses; - -// TODO `defaultmap`, `uses_allocators` clauses. -using TargetOperands = - detail::Clauses; - -using TargetDataOperands = - detail::Clauses; - -using TargetEnterExitUpdateDataOperands = - detail::Clauses; - -// TODO `affinity`, `detach` clauses. -using TaskOperands = - detail::Clauses; - -using TaskgroupOperands = - detail::Clauses; - -using TaskloopOperands = - detail::Clauses; - -using TaskwaitOperands = detail::Clauses; - -using TeamsOperands = - detail::Clauses; - -using WsloopOperands = - detail::Clauses; +/// omp.target_enter_data, omp.target_exit_data and omp.target_update take the +/// same clauses, so we give the structure to be shared by all of them a +/// representative name. +using TargetEnterExitUpdateDataOperands = TargetEnterDataOperands; } // namespace omp } // namespace mlir diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 1a9b87f0d68c9..e4ed58f26016a 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1370,7 +1370,7 @@ static LogicalResult verifyMapClause(Operation *op, OperandRange mapVars) { void TargetDataOp::build(OpBuilder &builder, OperationState &state, const TargetDataOperands &clauses) { - TargetDataOp::build(builder, state, clauses.device, clauses.ifVar, + TargetDataOp::build(builder, state, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.useDeviceAddrVars, clauses.useDevicePtrVars); } @@ -1395,7 +1395,7 @@ void TargetEnterDataOp::build( MLIRContext *ctx = builder.getContext(); TargetEnterDataOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds), - clauses.dependVars, clauses.device, clauses.ifVar, + clauses.dependVars, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.nowait); } @@ -1415,7 +1415,7 @@ void TargetExitDataOp::build(OpBuilder &builder, OperationState &state, MLIRContext *ctx = builder.getContext(); TargetExitDataOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds), - clauses.dependVars, clauses.device, clauses.ifVar, + clauses.dependVars, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.nowait); } @@ -1434,7 +1434,7 @@ void TargetUpdateOp::build(OpBuilder &builder, OperationState &state, const TargetEnterExitUpdateDataOperands &clauses) { MLIRContext *ctx = builder.getContext(); TargetUpdateOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds), - clauses.dependVars, clauses.device, clauses.ifVar, + clauses.dependVars, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.nowait); } @@ -1456,7 +1456,7 @@ void TargetOp::build(OpBuilder &builder, OperationState &state, // inReductionByref, inReductionSyms. 
TargetOp::build(builder, state, /*allocate_vars=*/{}, /*allocator_vars=*/{}, makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars, - clauses.device, clauses.hasDeviceAddrVars, clauses.ifVar, + clauses.device, clauses.hasDeviceAddrVars, clauses.ifExpr, /*in_reduction_vars=*/{}, /*in_reduction_byref=*/nullptr, /*in_reduction_syms=*/nullptr, clauses.isDevicePtrVars, clauses.mapVars, clauses.nowait, clauses.privateVars, @@ -1488,9 +1488,8 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state, void ParallelOp::build(OpBuilder &builder, OperationState &state, const ParallelOperands &clauses) { MLIRContext *ctx = builder.getContext(); - ParallelOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, - clauses.ifVar, clauses.numThreads, clauses.privateVars, + clauses.ifExpr, clauses.numThreads, clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), clauses.procBindKind, clauses.reductionVars, makeDenseBoolArrayAttr(ctx, clauses.reductionByref), @@ -1588,13 +1587,12 @@ void TeamsOp::build(OpBuilder &builder, OperationState &state, const TeamsOperands &clauses) { MLIRContext *ctx = builder.getContext(); // TODO Store clauses in op: privateVars, privateSyms. - TeamsOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, - clauses.ifVar, clauses.numTeamsLower, clauses.numTeamsUpper, - /*private_vars=*/{}, - /*private_syms=*/nullptr, clauses.reductionVars, - makeDenseBoolArrayAttr(ctx, clauses.reductionByref), - makeArrayAttr(ctx, clauses.reductionSyms), - clauses.threadLimit); + TeamsOp::build( + builder, state, clauses.allocateVars, clauses.allocatorVars, + clauses.ifExpr, clauses.numTeamsLower, clauses.numTeamsUpper, + /*private_vars=*/{}, /*private_syms=*/nullptr, clauses.reductionVars, + makeDenseBoolArrayAttr(ctx, clauses.reductionByref), + makeArrayAttr(ctx, clauses.reductionSyms), clauses.threadLimit); } LogicalResult TeamsOp::verify() { @@ -1814,7 +1812,7 @@ void SimdOp::build(OpBuilder &builder, OperationState &state, // TODO Store clauses in op: linearVars, linearStepVars, privateVars, // privateSyms, reductionVars, reductionByref, reductionSyms. SimdOp::build(builder, state, clauses.alignedVars, - makeArrayAttr(ctx, clauses.alignments), clauses.ifVar, + makeArrayAttr(ctx, clauses.alignments), clauses.ifExpr, /*linear_vars=*/{}, /*linear_step_vars=*/{}, clauses.nontemporalVars, clauses.order, clauses.orderMod, /*private_vars=*/{}, /*private_syms=*/nullptr, @@ -1996,7 +1994,7 @@ void TaskOp::build(OpBuilder &builder, OperationState &state, // TODO Store clauses in op: privateVars, privateSyms. TaskOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars, - clauses.final, clauses.ifVar, clauses.inReductionVars, + clauses.final, clauses.ifExpr, clauses.inReductionVars, makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable, clauses.priority, /*private_vars=*/{}, /*private_syms=*/nullptr, @@ -2042,7 +2040,7 @@ void TaskloopOp::build(OpBuilder &builder, OperationState &state, // TODO Store clauses in op: privateVars, privateSyms. 
TaskloopOp::build( builder, state, clauses.allocateVars, clauses.allocatorVars, - clauses.final, clauses.grainsize, clauses.ifVar, clauses.inReductionVars, + clauses.final, clauses.grainsize, clauses.ifExpr, clauses.inReductionVars, makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable, clauses.nogroup, clauses.numTasks, clauses.priority, /*private_vars=*/{}, @@ -2424,7 +2422,7 @@ LogicalResult AtomicCaptureOp::verifyRegions() { void CancelOp::build(OpBuilder &builder, OperationState &state, const CancelOperands &clauses) { - CancelOp::build(builder, state, clauses.cancelDirective, clauses.ifVar); + CancelOp::build(builder, state, clauses.cancelDirective, clauses.ifExpr); } LogicalResult CancelOp::verify() { diff --git a/mlir/test/mlir-tblgen/openmp-clause-ops.td b/mlir/test/mlir-tblgen/openmp-clause-ops.td new file mode 100644 index 0000000000000..cee3f2a693bf8 --- /dev/null +++ b/mlir/test/mlir-tblgen/openmp-clause-ops.td @@ -0,0 +1,86 @@ +// Tablegen tests for the automatic generation of OpenMP clause operand +// structure definitions. + +// Run tablegen to generate OmpCommon.td in temp directory first. +// RUN: mkdir -p %t/mlir/Dialect/OpenMP +// RUN: mlir-tblgen --gen-directive-decl --directives-dialect=OpenMP \ +// RUN: %S/../../../llvm/include/llvm/Frontend/OpenMP/OMP.td \ +// RUN: -I %S/../../../llvm/include > %t/mlir/Dialect/OpenMP/OmpCommon.td + +// RUN: mlir-tblgen -gen-openmp-clause-ops -I %S/../../include -I %t %s 2>&1 | FileCheck %s + +include "mlir/Dialect/OpenMP/OpenMPOpBase.td" + + +def OpenMP_MyFirstClause : OpenMP_Clause< + /*isRequired=*/false, /*skipTraits=*/false, /*skipArguments=*/false, + /*skipAssemblyFormat=*/false, /*skipDescription=*/false, + /*skipExtraClassDeclaration=*/false> { + let arguments = (ins + // Simple attributes + I32Attr:$int_attr, + TypeAttr:$type_attr, + DeclareTargetAttr:$omp_attr, + + // Array attributes + F32ArrayAttr:$float_array_attr, + StrArrayAttr:$str_array_attr, + AnyIntElementsAttr:$anyint_elems_attr, + RankedF32ElementsAttr<[3, 4, 5]>:$float_nd_elems_attr, + + // Optional attributes + OptionalAttr:$opt_bool_attr, + OptionalAttr:$opt_int_array_attr, + OptionalAttr:$opt_int_elems_attr, + + // Multi-level composition + ConfinedAttr, [IntMinValue<0>]>:$complex_opt_int_attr, + + // ElementsAttrBase-related edge cases. 
+ // CHECK: warning: could not infer array-like attribute element type for argument 'elements_attr', will use bare `storageType` + ElementsAttr:$elements_attr, + // CHECK: warning: could not infer array-like attribute element type for argument 'string_elements_attr', will use bare `storageType` + StringElementsAttr:$string_elements_attr + ); +} +// CHECK: struct MyFirstClauseOps { +// CHECK-NEXT: ::mlir::IntegerAttr intAttr; +// CHECK-NEXT: ::mlir::TypeAttr typeAttr; +// CHECK-NEXT: ::mlir::omp::DeclareTargetAttr ompAttr; + +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Attribute> floatArrayAttr; +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Attribute> strArrayAttr; +// CHECK-NEXT: ::llvm::SmallVector<::llvm::APInt> anyintElemsAttr; +// CHECK-NEXT: ::llvm::SmallVector<::llvm::APFloat> floatNdElemsAttr; + +// CHECK-NEXT: ::mlir::BoolAttr optBoolAttr; +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Attribute> optIntArrayAttr; +// CHECK-NEXT: ::llvm::SmallVector optIntElemsAttr; + +// CHECK-NEXT: ::mlir::IntegerAttr complexOptIntAttr; + +// CHECK-NEXT: ::mlir::ElementsAttr elementsAttr; +// CHECK-NEXT: ::mlir::DenseElementsAttr stringElementsAttr; +// CHECK-NEXT: } + +def OpenMP_MySecondClause : OpenMP_Clause< + /*isRequired=*/false, /*skipTraits=*/false, /*skipArguments=*/false, + /*skipAssemblyFormat=*/false, /*skipDescription=*/false, + /*skipExtraClassDeclaration=*/false> { + let arguments = (ins + I32:$int_val, + Optional:$opt_any_val, + Variadic:$variadic_index_val + ); +} +// CHECK: struct MySecondClauseOps { +// CHECK-NEXT: ::mlir::Value intVal; +// CHECK-NEXT: ::mlir::Value optAnyVal; +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Value> variadicIndexVal; +// CHECK-NEXT: } + +def OpenMP_MyFirstOp : OpenMP_Op<"op", clauses=[OpenMP_MyFirstClause]>; +// CHECK: using MyFirstOperands = detail::Clauses; + +def OpenMP_MySecondOp : OpenMP_Op<"op", clauses=[OpenMP_MyFirstClause, OpenMP_MySecondClause]>; +// CHECK: using MySecondOperands = detail::Clauses; diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index 1545821263788..f546e1b1b6691 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -12,11 +12,54 @@ #include "mlir/TableGen/GenInfo.h" +#include "mlir/TableGen/CodeGenHelpers.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/FormatAdapters.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" using namespace llvm; +/// The code block defining the base mixin class for combining clause operand +/// structures. +static const char *const baseMixinClass = R"( +namespace detail { +template +struct Clauses : public Mixins... {}; +} // namespace detail +)"; + +/// The code block defining operation argument structures. +static const char *const operationArgStruct = R"( +using {0}Operands = detail::Clauses<{1}>; +)"; + +/// Remove multiple optional prefixes and suffixes from \c str. +/// +/// Prefixes and suffixes are attempted to be removed once in the order they +/// appear in the \c prefixes and \c suffixes arguments. All prefixes are +/// processed before suffixes are. 
This means it will behave as shown in the +/// following example: +/// - str: "PrePreNameSuf1Suf2" +/// - prefixes: ["Pre"] +/// - suffixes: ["Suf1", "Suf2"] +/// - return: "PreNameSuf1" +static StringRef stripPrefixAndSuffix(StringRef str, + llvm::ArrayRef prefixes, + llvm::ArrayRef suffixes) { + for (StringRef prefix : prefixes) + if (str.starts_with(prefix)) + str = str.drop_front(prefix.size()); + + for (StringRef suffix : suffixes) + if (str.ends_with(suffix)) + str = str.drop_back(suffix.size()); + + return str; +} + /// Obtain the name of the OpenMP clause a given record inheriting /// `OpenMP_Clause` refers to. /// @@ -53,19 +96,8 @@ static StringRef extractOmpClauseName(const Record *clause) { assert(!clauseClassName.empty() && "clause name must be found"); // Keep only the OpenMP clause name itself for reporting purposes. - StringRef prefix = "OpenMP_"; - StringRef suffixes[] = {"Skip", "Clause"}; - - if (clauseClassName.starts_with(prefix)) - clauseClassName = clauseClassName.substr(prefix.size()); - - for (StringRef suffix : suffixes) { - if (clauseClassName.ends_with(suffix)) - clauseClassName = - clauseClassName.substr(0, clauseClassName.size() - suffix.size()); - } - - return clauseClassName; + return stripPrefixAndSuffix(clauseClassName, /*prefixes=*/{"OpenMP_"}, + /*suffixes=*/{"Skip", "Clause"}); } /// Check that the given argument, identified by its name and initialization @@ -148,6 +180,139 @@ static void verifyClause(const Record *op, const Record *clause) { "or explicitly skipping this field."); } +/// Translate the type of an OpenMP clause's argument to its corresponding +/// representation for clause operand structures. +/// +/// All kinds of values are represented as `mlir::Value` fields, whereas +/// attributes are represented based on their `storageType`. +/// +/// \param[in] name The name of the argument. +/// \param[in] init The `DefInit` object representing the argument. +/// \param[out] nest Number of levels of array nesting associated with the +/// type. Must be initially set to 0. +/// \param[out] rank Rank (number of dimensions, if an array type) of the base +/// type. Must be initially set to 1. +/// +/// \return the name of the base type to represent elements of the argument +/// type. +static StringRef translateArgumentType(ArrayRef loc, StringInit *name, + Init *init, int &nest, int &rank) { + Record *def = cast(init)->getDef(); + + llvm::StringSet superClasses; + for (auto [sc, _] : def->getSuperClasses()) + superClasses.insert(sc->getNameInitAsString()); + + // Handle wrapper-style superclasses. + if (superClasses.contains("OptionalAttr")) + return translateArgumentType( + loc, name, def->getValue("baseAttr")->getValue(), nest, rank); + + if (superClasses.contains("TypedArrayAttrBase")) + return translateArgumentType( + loc, name, def->getValue("elementAttr")->getValue(), ++nest, rank); + + // Handle ElementsAttrBase superclasses. + if (superClasses.contains("ElementsAttrBase")) { + // TODO: Obtain the rank from ranked types. + ++nest; + + if (superClasses.contains("IntElementsAttrBase")) + return "::llvm::APInt"; + if (superClasses.contains("FloatElementsAttr") || + superClasses.contains("RankedFloatElementsAttr")) + return "::llvm::APFloat"; + if (superClasses.contains("DenseArrayAttrBase")) + return stripPrefixAndSuffix(def->getValueAsString("returnType"), + {"::llvm::ArrayRef<"}, {">"}); + + // Decrease the nesting depth in the case where the base type cannot be + // inferred, so that the bare storageType is used instead of a vector. 
+ --nest; + PrintWarning( + loc, + "could not infer array-like attribute element type for argument '" + + name->getAsUnquotedString() + "', will use bare `storageType`"); + } + + // Handle simple attribute and value types. + bool isAttr = superClasses.contains("Attr"); + bool isValue = superClasses.contains("TypeConstraint"); + if (superClasses.contains("Variadic")) + ++nest; + + if (isValue) { + assert(!isAttr && + "argument can't be simultaneously a value and an attribute"); + return "::mlir::Value"; + } + + assert(isAttr && "argument must be an attribute if it's not a value"); + return nest > 0 ? "::mlir::Attribute" + : def->getValueAsString("storageType").trim(); +} + +/// Generate the structure that represents the arguments of the given \c clause +/// record of type \c OpenMP_Clause. +/// +/// It will contain a field for each argument, using the same name translated to +/// camel case and the corresponding base type as returned by +/// translateArgumentType() optionally wrapped in one or more llvm::SmallVector. +/// +/// An additional field containing a tuple of integers to hold the size of each +/// dimension will also be created for multi-rank types. This is not yet +/// supported. +static void genClauseOpsStruct(const Record *clause, raw_ostream &os) { + if (clause->isAnonymous()) + return; + + StringRef clauseName = extractOmpClauseName(clause); + os << "struct " << clauseName << "ClauseOps {\n"; + + DagInit *arguments = clause->getValueAsDag("arguments"); + for (auto [name, arg] : + zip_equal(arguments->getArgNames(), arguments->getArgs())) { + int nest = 0, rank = 1; + StringRef baseType = + translateArgumentType(clause->getLoc(), name, arg, nest, rank); + std::string fieldName = + convertToCamelFromSnakeCase(name->getAsUnquotedString(), + /*capitalizeFirst=*/false); + + os << formatv(" {0}{1}{2} {3};\n", + fmt_repeat("::llvm::SmallVector<", nest), baseType, + fmt_repeat(">", nest), fieldName); + + if (rank > 1) { + assert(nest >= 1 && "must be nested if it's a ranked type"); + os << formatv(" {0}::std::tuple<{1}int>{2} {3}Dims;\n", + fmt_repeat("::llvm::SmallVector<", nest - 1), + fmt_repeat("int, ", rank - 1), fmt_repeat(">", nest - 1), + fieldName); + } + } + + os << "};\n"; +} + +/// Generate the structure that represents the clause-related arguments of the +/// given \c op record of type \c OpenMP_Op. +/// +/// This structure will be defined in terms of the clause operand structures +/// associated to the clauses of the operation. +static void genOperandsDef(const Record *op, raw_ostream &os) { + if (op->isAnonymous()) + return; + + SmallVector clauseNames; + for (Record *clause : op->getValueAsListOfDefs("clauseList")) + clauseNames.push_back((extractOmpClauseName(clause) + "ClauseOps").str()); + + StringRef opName = stripPrefixAndSuffix( + op->getName(), /*prefixes=*/{"OpenMP_"}, /*suffixes=*/{"Op"}); + os << formatv(operationArgStruct, opName, join(clauseNames, ", ")); +} + /// Verify that all properties of `OpenMP_Clause`s of records deriving from /// `OpenMP_Op`s have been inherited by the latter. static bool verifyDecls(const RecordKeeper &recordKeeper, raw_ostream &) { @@ -159,8 +324,32 @@ static bool verifyDecls(const RecordKeeper &recordKeeper, raw_ostream &) { return false; } +/// Generate structures to represent clause-related operands, based on existing +/// `OpenMP_Clause` definitions and aggregate them into operation-specific +/// structures according to the `clauses` argument of each definition deriving +/// from `OpenMP_Op`. 
+static bool genClauseOps(const RecordKeeper &recordKeeper, raw_ostream &os) { + mlir::tblgen::NamespaceEmitter ns(os, "mlir::omp"); + for (const Record *clause : + recordKeeper.getAllDerivedDefinitions("OpenMP_Clause")) + genClauseOpsStruct(clause, os); + + // Produce base mixin class. + os << baseMixinClass; + + for (const Record *op : recordKeeper.getAllDerivedDefinitions("OpenMP_Op")) + genOperandsDef(op, os); + + return false; +} + // Registers the generator to mlir-tblgen. static mlir::GenRegistration verifyOpenmpOps("verify-openmp-ops", "Verify OpenMP operations (produce no output file)", verifyDecls); + +static mlir::GenRegistration + genOpenmpClauseOps("gen-openmp-clause-ops", + "Generate OpenMP clause operand structures", + genClauseOps); From e50131aa068f74daa70d4135c92020aadae3af33 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Wed, 11 Sep 2024 13:20:59 +0200 Subject: [PATCH 083/114] [clang] Diagnose dangling issues for the "Container" case. (#107213) This pull request enhances the GSL lifetime analysis to detect situations where a dangling `Container` object is constructed: ```cpp std::vector bad = {std::string()}; // dangling ``` The assignment case is not yet supported, but they will be addressed in a follow-up. Fixes #100526 (excluding the `push_back` case). --- clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/Basic/AttrDocs.td | 14 ++++ clang/lib/Sema/CheckExprLifetime.cpp | 42 +++++++--- .../Sema/warn-lifetime-analysis-nocfg.cpp | 77 +++++++++++++++++++ 4 files changed, 126 insertions(+), 9 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 250821a9f9c45..59ccdf1e15cd8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -298,6 +298,8 @@ Improvements to Clang's diagnostics - Clang now warns for u8 character literals used in C23 with ``-Wpre-c23-compat`` instead of ``-Wpre-c++17-compat``. +- Clang now diagnoses cases where a dangling ``GSLOwner`` object is constructed, e.g. ``std::vector v = {std::string()};`` (#GH100526). + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 546e5100b79dd..9f72456d2da67 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -6690,6 +6690,20 @@ When the Owner's lifetime ends, it will consider the Pointer to be dangling. P.getInt(); // P is dangling } +If a template class is annotated with ``[[gsl::Owner]]``, and the first +instantiated template argument is a pointer type (raw pointer, or ``[[gsl::Pointer]]``), +the analysis will consider the instantiated class as a container of the pointer. +When constructing such an object from a GSL owner object, the analysis will +assume that the container holds a pointer to the owner object. Consequently, +when the owner object is destroyed, the pointer will be considered dangling. + +.. code-block:: c++ + + int f() { + std::vector v = {std::string()}; // v holds a dangling pointer. + std::optional o = std::string(); // o holds a dangling pointer. + } + }]; } diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index f1507ebb9a506..c8e703036c132 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -267,6 +267,26 @@ static bool isInStlNamespace(const Decl *D) { return DC->isStdNamespace(); } +// Returns true if the given Record decl is a form of `GSLOwner` +// type, e.g. std::vector, std::optional. 
+static bool isContainerOfPointer(const RecordDecl *Container) { + if (const auto *CTSD = + dyn_cast_if_present(Container)) { + if (!CTSD->hasAttr()) // Container must be a GSL owner type. + return false; + const auto &TAs = CTSD->getTemplateArgs(); + return TAs.size() > 0 && TAs[0].getKind() == TemplateArgument::Type && + (isRecordWithAttr(TAs[0].getAsType()) || + TAs[0].getAsType()->isPointerType()); + } + return false; +} + +static bool isGSLOwner(QualType T) { + return isRecordWithAttr(T) && + !isContainerOfPointer(T->getAsRecordDecl()); +} + static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { if (auto *Conv = dyn_cast_or_null(Callee)) if (isRecordWithAttr(Conv->getConversionType())) @@ -275,7 +295,7 @@ static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { return false; if (!isRecordWithAttr( Callee->getFunctionObjectParameterType()) && - !isRecordWithAttr(Callee->getFunctionObjectParameterType())) + !isGSLOwner(Callee->getFunctionObjectParameterType())) return false; if (Callee->getReturnType()->isPointerType() || isRecordWithAttr(Callee->getReturnType())) { @@ -413,7 +433,7 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, // Once we initialized a value with a non gsl-owner reference, it can no // longer dangle. if (ReturnType->isReferenceType() && - !isRecordWithAttr(ReturnType->getPointeeType())) { + !isGSLOwner(ReturnType->getPointeeType())) { for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit || PE.Kind == IndirectLocalPathEntry::LifetimeBoundCall) @@ -468,12 +488,17 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); else if (EnableGSLAnalysis && I == 0) { + // Perform GSL analysis for the first argument if (shouldTrackFirstArgument(Callee)) { VisitGSLPointerArg(Callee, Args[0]); - } else if (auto *CCE = dyn_cast(Call); - CCE && - CCE->getConstructor()->getParent()->hasAttr()) { - VisitGSLPointerArg(CCE->getConstructor(), Args[0]); + } else if (auto *Ctor = dyn_cast(Call)) { + const auto *ClassD = Ctor->getConstructor()->getParent(); + // Two cases: + // a GSL pointer, e.g. std::string_view + // a container of GSL pointer, e.g. std::vector + if (ClassD->hasAttr() || + (isContainerOfPointer(ClassD) && Callee->getNumParams() == 1)) + VisitGSLPointerArg(Ctor->getConstructor(), Args[0]); } } } @@ -990,13 +1015,12 @@ static void checkExprLifetimeImpl(Sema &SemaRef, // int &p = *localUniquePtr; // someContainer.add(std::move(localUniquePtr)); // return p; - IsLocalGslOwner = isRecordWithAttr(L->getType()); + IsLocalGslOwner = isGSLOwner(L->getType()); if (pathContainsInit(Path) || !IsLocalGslOwner) return false; } else { IsGslPtrValueFromGslTempOwner = - MTE && !MTE->getExtendingDecl() && - isRecordWithAttr(MTE->getType()); + MTE && !MTE->getExtendingDecl() && isGSLOwner(MTE->getType()); // Skipping a chain of initializing gsl::Pointer annotated objects. // We are looking only for the final source to find out if it was // a local or temporary owner or the address of a local variable/param. 
diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index 59357d0730a7d..234e06f069074 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -158,17 +158,30 @@ auto begin(C &c) -> decltype(c.begin()); template T *begin(T (&array)[N]); +using size_t = decltype(sizeof(0)); + +template +struct initializer_list { + const T* ptr; size_t sz; +}; template struct vector { typedef __gnu_cxx::basic_iterator iterator; iterator begin(); iterator end(); const T *data() const; + vector(); + vector(initializer_list __l); + + template + vector(InputIterator first, InputIterator __last); + T &at(int n); }; template struct basic_string_view { + basic_string_view(); basic_string_view(const T *); const T *begin() const; }; @@ -203,11 +216,21 @@ template struct optional { optional(); optional(const T&); + + template + optional(U&& t); + + template + optional(optional&& __t); + T &operator*() &; T &&operator*() &&; T &value() &; T &&value() &&; }; +template +optional<__decay(T)> make_optional(T&&); + template struct stack { @@ -553,3 +576,57 @@ void test() { std::string_view svjkk1 = ReturnStringView(StrCat("bar", "x")); // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}} } } // namespace GH100549 + +namespace GH100526 { +void test() { + std::vector v1({std::string()}); // expected-warning {{object backing the pointer will be destroyed at the end}} + std::vector v2({ + std::string(), // expected-warning {{object backing the pointer will be destroyed at the end}} + std::string_view() + }); + std::vector v3({ + std::string_view(), + std::string() // expected-warning {{object backing the pointer will be destroyed at the end}} + }); + + std::optional o1 = std::string(); // expected-warning {{object backing the pointer}} + + std::string s; + // This is a tricky use-after-free case, what it does: + // 1. make_optional creates a temporary "optional"" object + // 2. the temporary object owns the underlying string which is copied from s. + // 3. the t3 object holds the view to the underlying string of the temporary object. + std::optional o2 = std::make_optional(s); // expected-warning {{object backing the pointer}} + std::optional o3 = std::optional(s); // expected-warning {{object backing the pointer}} + std::optional o4 = std::optional(s); + + // FIXME: should work for assignment cases + v1 = {std::string()}; + o1 = std::string(); + + // no warning on copying pointers. + std::vector n1 = {std::string_view()}; + std::optional n2 = {std::string_view()}; + std::optional n3 = std::string_view(); + std::optional n4 = std::make_optional(std::string_view()); + const char* b = ""; + std::optional n5 = std::make_optional(b); + std::optional n6 = std::make_optional("test"); +} + +std::vector test2(int i) { + std::vector t; + if (i) + return t; // this is fine, no dangling + return std::vector(t.begin(), t.end()); +} + +std::optional test3(int i) { + std::string s; + std::string_view sv; + if (i) + return s; // expected-warning {{address of stack memory associated}} + return sv; // fine +} + +} // namespace GH100526 From 334873fe2df27a4fa613e8744f29e502d3358397 Mon Sep 17 00:00:00 2001 From: Amy Wang Date: Wed, 11 Sep 2024 07:37:35 -0400 Subject: [PATCH 084/114] [MLIR][Python] Python binding support for IntegerSet attribute (#107640) Support IntegerSet attribute python binding. 
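A minimal usage sketch (mirroring the `testIntegerSetAttr` case added below;
the particular set is illustrative):

```python
from mlir.ir import AffineDimExpr, Attribute, Context, IntegerSet, IntegerSetAttr

with Context():
    d0 = AffineDimExpr.get(0)
    d1 = AffineDimExpr.get(1)
    # Build the set (d0, d1) : (d0 - d1 == 0).
    s = IntegerSet.get(2, 0, [d0 - d1], [True])
    attr = IntegerSetAttr.get(s)  # wrap the IntegerSet in an attribute
    assert IntegerSetAttr.isinstance(attr)
    assert Attribute.parse(str(attr)) == attr  # round-trips through the parser
```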
--- mlir/include/mlir-c/BuiltinAttributes.h | 9 +++++++++ mlir/lib/Bindings/Python/IRAttributes.cpp | 22 +++++++++++++++++++++- mlir/lib/CAPI/IR/BuiltinAttributes.cpp | 9 +++++++++ mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 16 ++++++++++++++++ mlir/python/mlir/ir.py | 5 +++++ mlir/test/python/ir/attributes.py | 18 ++++++++++++++++++ 6 files changed, 78 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir-c/BuiltinAttributes.h b/mlir/include/mlir-c/BuiltinAttributes.h index 231eb83b5e269..7c8c84e55b962 100644 --- a/mlir/include/mlir-c/BuiltinAttributes.h +++ b/mlir/include/mlir-c/BuiltinAttributes.h @@ -16,6 +16,7 @@ #include "mlir-c/AffineMap.h" #include "mlir-c/IR.h" +#include "mlir-c/IntegerSet.h" #include "mlir-c/Support.h" #ifdef __cplusplus @@ -177,6 +178,14 @@ MLIR_CAPI_EXPORTED bool mlirBoolAttrGetValue(MlirAttribute attr); /// Checks whether the given attribute is an integer set attribute. MLIR_CAPI_EXPORTED bool mlirAttributeIsAIntegerSet(MlirAttribute attr); +/// Creates an integer set attribute wrapping the given set. The attribute +/// belongs to the same context as the integer set. +MLIR_CAPI_EXPORTED MlirAttribute mlirIntegerSetAttrGet(MlirIntegerSet set); + +/// Returns the integer set wrapped in the given integer set attribute. +MLIR_CAPI_EXPORTED MlirIntegerSet +mlirIntegerSetAttrGetValue(MlirAttribute attr); + /// Returns the typeID of an IntegerSet attribute. MLIR_CAPI_EXPORTED MlirTypeID mlirIntegerSetAttrGetTypeID(void); diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp index b4049bd7972d4..bfdd4a520af27 100644 --- a/mlir/lib/Bindings/Python/IRAttributes.cpp +++ b/mlir/lib/Bindings/Python/IRAttributes.cpp @@ -147,6 +147,26 @@ class PyAffineMapAttribute : public PyConcreteAttribute { } }; +class PyIntegerSetAttribute + : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAIntegerSet; + static constexpr const char *pyClassName = "IntegerSetAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + static constexpr GetTypeIDFunctionTy getTypeIdFunction = + mlirIntegerSetAttrGetTypeID; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](PyIntegerSet &integerSet) { + MlirAttribute attr = mlirIntegerSetAttrGet(integerSet.get()); + return PyIntegerSetAttribute(integerSet.getContext(), attr); + }, + py::arg("integer_set"), "Gets an attribute wrapping an IntegerSet."); + } +}; + template static T pyTryCast(py::handle object) { try { @@ -1426,7 +1446,6 @@ py::object symbolRefOrFlatSymbolRefAttributeCaster(PyAttribute &pyAttribute) { void mlir::python::populateIRAttributes(py::module &m) { PyAffineMapAttribute::bind(m); - PyDenseBoolArrayAttribute::bind(m); PyDenseBoolArrayAttribute::PyDenseArrayIterator::bind(m); PyDenseI8ArrayAttribute::bind(m); @@ -1466,6 +1485,7 @@ void mlir::python::populateIRAttributes(py::module &m) { PyOpaqueAttribute::bind(m); PyFloatAttribute::bind(m); PyIntegerAttribute::bind(m); + PyIntegerSetAttribute::bind(m); PyStringAttribute::bind(m); PyTypeAttribute::bind(m); PyGlobals::get().registerTypeCaster( diff --git a/mlir/lib/CAPI/IR/BuiltinAttributes.cpp b/mlir/lib/CAPI/IR/BuiltinAttributes.cpp index 726af884668b2..11d1ade552f5a 100644 --- a/mlir/lib/CAPI/IR/BuiltinAttributes.cpp +++ b/mlir/lib/CAPI/IR/BuiltinAttributes.cpp @@ -10,6 +10,7 @@ #include "mlir-c/Support.h" #include "mlir/CAPI/AffineMap.h" #include "mlir/CAPI/IR.h" +#include "mlir/CAPI/IntegerSet.h" #include "mlir/CAPI/Support.h" #include "mlir/IR/AsmState.h" #include 
"mlir/IR/Attributes.h" @@ -192,6 +193,14 @@ MlirTypeID mlirIntegerSetAttrGetTypeID(void) { return wrap(IntegerSetAttr::getTypeID()); } +MlirAttribute mlirIntegerSetAttrGet(MlirIntegerSet set) { + return wrap(IntegerSetAttr::get(unwrap(set))); +} + +MlirIntegerSet mlirIntegerSetAttrGetValue(MlirAttribute attr) { + return wrap(llvm::cast(unwrap(attr)).getValue()); +} + //===----------------------------------------------------------------------===// // Opaque attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index 7b4fac7275bfc..a3d3a92618696 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -138,6 +138,7 @@ __all__ = [ "InsertionPoint", "IntegerAttr", "IntegerSet", + "IntegerSetAttr", "IntegerSetConstraint", "IntegerSetConstraintList", "IntegerType", @@ -1905,6 +1906,21 @@ class IntegerSet: @property def n_symbols(self) -> int: ... +class IntegerSetAttr(Attribute): + static_typeid: ClassVar[TypeID] + @staticmethod + def get(integer_set) -> IntegerSetAttr: + """ + Gets an attribute wrapping an IntegerSet. + """ + @staticmethod + def isinstance(other: Attribute) -> bool: ... + def __init__(self, cast_from_attr: Attribute) -> None: ... + @property + def type(self) -> Type: ... + @property + def typeid(self) -> TypeID: ... + class IntegerSetConstraint: def __init__(self, *args, **kwargs) -> None: ... @property diff --git a/mlir/python/mlir/ir.py b/mlir/python/mlir/ir.py index a9ac765fe1c17..9a6ce462047ad 100644 --- a/mlir/python/mlir/ir.py +++ b/mlir/python/mlir/ir.py @@ -22,6 +22,11 @@ def _affineMapAttr(x, context): return AffineMapAttr.get(x) +@register_attribute_builder("IntegerSetAttr") +def _integerSetAttr(x, context): + return IntegerSetAttr.get(x) + + @register_attribute_builder("BoolAttr") def _boolAttr(x, context): return BoolAttr.get(x, context=context) diff --git a/mlir/test/python/ir/attributes.py b/mlir/test/python/ir/attributes.py index 4b475db634645..00c3e1b4decdb 100644 --- a/mlir/test/python/ir/attributes.py +++ b/mlir/test/python/ir/attributes.py @@ -162,6 +162,24 @@ def testAffineMapAttr(): assert attr_built == attr_parsed +# CHECK-LABEL: TEST: testIntegerSetAttr +@run +def testIntegerSetAttr(): + with Context() as ctx: + d0 = AffineDimExpr.get(0) + d1 = AffineDimExpr.get(1) + s0 = AffineSymbolExpr.get(0) + c42 = AffineConstantExpr.get(42) + set0 = IntegerSet.get(2, 1, [d0 - d1, s0 - c42], [True, False]) + + # CHECK: affine_set<(d0, d1)[s0] : (d0 - d1 == 0, s0 - 42 >= 0)> + attr_built = IntegerSetAttr.get(set0) + print(str(attr_built)) + + attr_parsed = Attribute.parse(str(attr_built)) + assert attr_built == attr_parsed + + # CHECK-LABEL: TEST: testFloatAttr @run def testFloatAttr(): From 7c25ae87f7378f38aa49a92b9cf8092deb95a1f4 Mon Sep 17 00:00:00 2001 From: Frederik Carlier Date: Wed, 11 Sep 2024 13:38:00 +0200 Subject: [PATCH 085/114] Set dllimport on Objective C ivar offsets (#107604) Ensures that offsets for instance variables are marked with `dllimport` if the interface to which they belong has this attribute. 
--- clang/lib/CodeGen/CGObjCGNU.cpp | 11 +++++++++-- clang/test/CodeGenObjC/dllstorage.m | 4 ++-- clang/test/SemaObjC/ivar-access-tests.m | 10 ++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index adc7cdbfded88..6280e9465ecba 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1699,11 +1699,18 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { llvm::Value *EmitIvarOffset(CodeGenFunction &CGF, const ObjCInterfaceDecl *Interface, const ObjCIvarDecl *Ivar) override { - const std::string Name = GetIVarOffsetVariableName(Ivar->getContainingInterface(), Ivar); + const ObjCInterfaceDecl *ContainingInterface = + Ivar->getContainingInterface(); + const std::string Name = + GetIVarOffsetVariableName(ContainingInterface, Ivar); llvm::GlobalVariable *IvarOffsetPointer = TheModule.getNamedGlobal(Name); - if (!IvarOffsetPointer) + if (!IvarOffsetPointer) { IvarOffsetPointer = new llvm::GlobalVariable(TheModule, IntTy, false, llvm::GlobalValue::ExternalLinkage, nullptr, Name); + if (Ivar->getAccessControl() != ObjCIvarDecl::Private && + Ivar->getAccessControl() != ObjCIvarDecl::Package) + CGM.setGVProperties(IvarOffsetPointer, ContainingInterface); + } CharUnits Align = CGM.getIntAlign(); llvm::Value *Offset = CGF.Builder.CreateAlignedLoad(IntTy, IvarOffsetPointer, Align); diff --git a/clang/test/CodeGenObjC/dllstorage.m b/clang/test/CodeGenObjC/dllstorage.m index c94f4c9b5804d..a6c591b2d7930 100644 --- a/clang/test/CodeGenObjC/dllstorage.m +++ b/clang/test/CodeGenObjC/dllstorage.m @@ -112,7 +112,7 @@ @interface M : I { // CHECK-IR-DAG: @"OBJC_IVAR_$_M._ivar" = external dllimport global i32 // CHECK-NF-DAG: @"$_OBJC_REF_CLASS_M" = external dllimport global ptr -// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external global i32 +// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external dllimport global i32 __declspec(dllexport) __attribute__((__objc_exception__)) @@ -151,7 +151,7 @@ id f(Q *q) { // CHECK-IR-DAG: @"OBJC_IVAR_$_M._ivar" = external dllimport global i32 -// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external global i32 +// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external dllimport global i32 int g(void) { @autoreleasepool { diff --git a/clang/test/SemaObjC/ivar-access-tests.m b/clang/test/SemaObjC/ivar-access-tests.m index cd7e09d406ada..6060dea5ab0f0 100644 --- a/clang/test/SemaObjC/ivar-access-tests.m +++ b/clang/test/SemaObjC/ivar-access-tests.m @@ -2,6 +2,8 @@ @interface MySuperClass { + int unmarked; + @private int private; @@ -17,6 +19,7 @@ @implementation MySuperClass - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; access = s->private; access = s->protected; } @@ -30,9 +33,11 @@ @implementation MyClass - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; MyClass *m=0; + access = m->unmarked; access = m->private; // expected-error {{instance variable 'private' is private}} access = m->protected; } @@ -46,9 +51,11 @@ @implementation Deeper - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; MyClass *m=0; + access = m->unmarked; access = m->private; // expected-error {{instance variable 'private' is private}} access = m->protected; } @@ -61,9 +68,11 @@ @implementation 
Unrelated - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; // expected-error {{instance variable 'unmarked' is protected}} access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; // expected-error {{instance variable 'protected' is protected}} MyClass *m=0; + access = m->unmarked; // expected-error {{instance variable 'unmarked' is protected}} access = m->private; // expected-error {{instance variable 'private' is private}} access = m->protected; // expected-error {{instance variable 'protected' is protected}} } @@ -73,6 +82,7 @@ int main (void) { MySuperClass *s = 0; int access; + access = s->unmarked; // expected-error {{instance variable 'unmarked' is protected}} access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; // expected-error {{instance variable 'protected' is protected}} return 0; From b35bb7b797e81e1d972c8e6d60e20e39c1917b99 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 11 Sep 2024 19:46:26 +0800 Subject: [PATCH 086/114] [mlir] Fix 'StringSet' may not intend to support class template argument deduction (NFC) /llvm-project/mlir/tools/mlir-tblgen/OmpOpGen.cpp:202:3: error: 'StringSet' may not intend to support class template argument deduction [-Werror,-Wctad-maybe-unsupported] llvm::StringSet superClasses; ^ /llvm-project/llvm/include/llvm/ADT/StringSet.h:23:7: note: add a deduction guide to suppress this warning class StringSet : public StringMap { ^ --- mlir/tools/mlir-tblgen/OmpOpGen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index f546e1b1b6691..c9d25a5dee5cd 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -199,7 +199,7 @@ static StringRef translateArgumentType(ArrayRef loc, StringInit *name, Init *init, int &nest, int &rank) { Record *def = cast(init)->getDef(); - llvm::StringSet superClasses; + llvm::StringSet<> superClasses; for (auto [sc, _] : def->getSuperClasses()) superClasses.insert(sc->getNameInitAsString()); From 0856f12bb0a9829a282bef7c26ad536ff3b1e0a5 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 11 Sep 2024 19:59:00 +0800 Subject: [PATCH 087/114] [mlir] Fix -Wunused-variable in OmpOpGen.cpp (NFC) /llvm-project/mlir/tools/mlir-tblgen/OmpOpGen.cpp:239:8: error: unused variable 'isAttr' [-Werror,-Wunused-variable] bool isAttr = superClasses.contains("Attr"); ^ --- mlir/tools/mlir-tblgen/OmpOpGen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index c9d25a5dee5cd..23368c56bee8c 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -236,7 +236,7 @@ static StringRef translateArgumentType(ArrayRef loc, StringInit *name, } // Handle simple attribute and value types. 
- bool isAttr = superClasses.contains("Attr"); + [[maybe_unused]] bool isAttr = superClasses.contains("Attr"); bool isValue = superClasses.contains("TypeConstraint"); if (superClasses.contains("Variadic")) ++nest;

From ed22029eea12b37c2a58f2c6b8d67f12009102a0 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Wed, 11 Sep 2024 14:18:14 +0200 Subject: [PATCH 088/114] [SPIR-V] Address the case when optimization uses GEP operator and GenCode creates G_PTR_ADD to convey the semantics (#107880)

When running the SPIR-V Backend with optimization levels higher than 0, we observe GEP operators as a new factor, used extensively to convey the semantics of the original LLVM IR. Previously, an issue related to the GEP operator was reported and fixed on the consumer side of toolchains (see, for example, Khronos Translator issue https://github.com/KhronosGroup/SPIRV-LLVM-Translator/issues/2486 and PR https://github.com/KhronosGroup/SPIRV-LLVM-Translator/pull/2487). However, there is also a case where GenCode creates G_PTR_ADD to convey the original semantics at optimization levels higher than 0, and there it is the SPIR-V Backend itself that fails to translate the source LLVM IR correctly. Consider the following reproducer:

```
%struct = type { i32, [257 x i8], [257 x i8], [129 x i8], i32, i64, i64, i64, i64, i64, i64 }
@Mem = linkonce_odr dso_local addrspace(1) global %struct zeroinitializer, align 8

define weak dso_local spir_func void @__devicelib_assert_fail(ptr addrspace(4) noundef %expr, i32 noundef %line, i1 %fl) {
entry:
  %cmp = icmp eq i32 %line, 0
  br i1 %cmp, label %lbl, label %exit

lbl:
  store i32 %line, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @Mem, i64 648), align 8
  br i1 %fl, label %lbl, label %exit

exit:
  ret void
}
```

converted to the following machine instructions by the SPIR-V Backend:

```
%4:type(s64) = OpTypeInt 32, 0
%22:type(s64) = OpTypePointer 5, %4:type(s64)
%2:type(s64) = OpTypeInt 8, 0
%28:type(s64) = OpTypePointer 5, %2:type(s64)
%10:pid(p1) = G_GLOBAL_VALUE @Mem
%36:type(s64) = OpTypeStruct %4:type(s64), %32:type(s64), %32:type(s64), %34:type(s64), %4:type(s64), %35:type(s64), %35:type(s64), %35:type(s64), %35:type(s64), %35:type(s64), %35:type(s64)
%37:iid(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.const.composite)
%8:iid(s32) = ASSIGN_TYPE %37:iid(s32), %36:type(s64)
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.init.global), %10:pid(p1), %8:iid(s32)
%29:pid(p1) = nuw G_PTR_ADD %10:pid, %16:iid(s64)
%15:pid(p1) = nuw ASSIGN_TYPE %29:pid(p1), %28:type(s64)
%27:pid(p2) = G_BITCAST %15:pid(p1)
%17:pid(p2) = ASSIGN_TYPE %27:pid(p2), %22:type(s64)
G_STORE %1:iid(s32), %17:pid(p2) :: (store (s32) into %ir.3, align 8, addrspace 1)
```

At the next stage, instruction selection would interpret this `G_PTR_ADD`-related pattern as the initialization of a global variable and convert it to an invalid constant GEP pattern that would, in turn, fail LLVM verification during back-translation from SPIR-V to LLVM IR. This PR fixes the problem by adding one more case of `G_PTR_ADD` translation, which uses a non-constant GEP to convey the meaning. The reproducer is attached as a new test case.
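To make the intended lowering concrete, here is a sketch of the SPIR-V sequence the new non-constant-GEP path is expected to produce for the store above (result names are illustrative; the authoritative pattern is in the CHECK lines of the attached test):

```
%casted = OpBitcast %ptr_char %Mem                        ; retype the base pointer to match the result
%addr = OpInBoundsPtrAccessChain %ptr_char %casted %c648  ; non-constant GEP with byte offset 648
%dst = OpBitcast %ptr_int %addr                           ; cast to the pointer type expected by the store
OpStore %dst %line
```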
--- .../Target/SPIRV/SPIRVDuplicatesTracker.cpp | 14 ++++ .../Target/SPIRV/SPIRVInstructionSelector.cpp | 67 +++++++++++++++++-- .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 1 + .../CodeGen/SPIRV/opt-gepoperator-of-gvar.ll | 64 ++++++++++++++++++ 4 files changed, 141 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp index 7c32bb1968ef5..832ca0ba5a82d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp @@ -13,6 +13,8 @@ #include "SPIRVDuplicatesTracker.h" +#define DEBUG_TYPE "build-dep-graph" + using namespace llvm; template @@ -63,6 +65,18 @@ void SPIRVGeneralDuplicatesTracker::buildDepsGraph( if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL && i == 2) continue; MachineOperand *RegOp = &VRegDef->getOperand(0); + LLVM_DEBUG({ + if (Reg2Entry.count(RegOp) == 0 && + (MI->getOpcode() != SPIRV::OpVariable || i != 3)) { + dbgs() << "Unexpected pattern while building a dependency " + "graph.\nInstruction: "; + MI->print(dbgs()); + dbgs() << "Operand: "; + Op.print(dbgs()); + dbgs() << "\nOperand definition: "; + VRegDef->print(dbgs()); + } + }); assert((MI->getOpcode() == SPIRV::OpVariable && i == 3) || Reg2Entry.count(RegOp)); if (Reg2Entry.count(RegOp)) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 1e861da35aaac..831d7f76ac14c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -607,10 +607,7 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, case TargetOpcode::G_ADDRSPACE_CAST: return selectAddrSpaceCast(ResVReg, ResType, I); case TargetOpcode::G_PTR_ADD: { - // Currently, we get G_PTR_ADD only as a result of translating - // global variables, initialized with constant expressions like GV + Const - // (see test opencl/basic/progvar_prog_scope_init.ll). - // TODO: extend the handler once we have other cases. + // Currently, we get G_PTR_ADD only applied to global variables. assert(I.getOperand(1).isReg() && I.getOperand(2).isReg()); Register GV = I.getOperand(1).getReg(); MachineRegisterInfo::def_instr_iterator II = MRI->def_instr_begin(GV); @@ -619,8 +616,68 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, (*II).getOpcode() == TargetOpcode::COPY || (*II).getOpcode() == SPIRV::OpVariable) && isImm(I.getOperand(2), MRI)); - Register Idx = buildZerosVal(GR.getOrCreateSPIRVIntegerType(32, I, TII), I); + // It may be the initialization of a global variable. + bool IsGVInit = false; + for (MachineRegisterInfo::use_instr_iterator + UseIt = MRI->use_instr_begin(I.getOperand(0).getReg()), + UseEnd = MRI->use_instr_end(); + UseIt != UseEnd; UseIt = std::next(UseIt)) { + if ((*UseIt).getOpcode() == TargetOpcode::G_GLOBAL_VALUE || + (*UseIt).getOpcode() == SPIRV::OpVariable) { + IsGVInit = true; + break; + } + } MachineBasicBlock &BB = *I.getParent(); + if (!IsGVInit) { + SPIRVType *GVType = GR.getSPIRVTypeForVReg(GV); + SPIRVType *GVPointeeType = GR.getPointeeType(GVType); + SPIRVType *ResPointeeType = GR.getPointeeType(ResType); + if (GVPointeeType && ResPointeeType && GVPointeeType != ResPointeeType) { + // Build a new virtual register that is associated with the required + // data type. 
+ Register NewVReg = MRI->createGenericVirtualRegister(MRI->getType(GV)); + MRI->setRegClass(NewVReg, MRI->getRegClass(GV)); + // Having a correctly typed base we are ready to build the actually + // required GEP. It may not be a constant though, because all Operands + // of OpSpecConstantOp is to originate from other const instructions, + // and only the AccessChain named opcodes accept a global OpVariable + // instruction. We can't use an AccessChain opcode because of the type + // mismatch between result and base types. + if (!GR.isBitcastCompatible(ResType, GVType)) + report_fatal_error( + "incompatible result and operand types in a bitcast"); + Register ResTypeReg = GR.getSPIRVTypeID(ResType); + MachineInstrBuilder MIB = + BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpBitcast)) + .addDef(NewVReg) + .addUse(ResTypeReg) + .addUse(GV); + return MIB.constrainAllUses(TII, TRI, RBI) && + BuildMI(BB, I, I.getDebugLoc(), + TII.get(STI.isVulkanEnv() + ? SPIRV::OpInBoundsAccessChain + : SPIRV::OpInBoundsPtrAccessChain)) + .addDef(ResVReg) + .addUse(ResTypeReg) + .addUse(NewVReg) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); + } else { + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addImm( + static_cast(SPIRV::Opcode::InBoundsPtrAccessChain)) + .addUse(GV) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); + } + } + // It's possible to translate G_PTR_ADD to OpSpecConstantOp: either to + // initialize a global variable with a constant expression (e.g., the test + // case opencl/basic/progvar_prog_scope_init.ll), or for another use case + Register Idx = buildZerosVal(GR.getOrCreateSPIRVIntegerType(32, I, TII), I); auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 96601dd8796c6..23cd32eff45d5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -1628,6 +1628,7 @@ multiclass OpcodeOperand value> { defm : SymbolicOperandWithRequirements; } // TODO: implement other mnemonics. 
+defm InBoundsAccessChain : OpcodeOperand<66>; defm InBoundsPtrAccessChain : OpcodeOperand<70>; defm PtrCastToGeneric : OpcodeOperand<121>; defm Bitcast : OpcodeOperand<124>; diff --git a/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll new file mode 100644 index 0000000000000..5f9229f5a5bd6 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll @@ -0,0 +1,64 @@ +; RUN: llc -O2 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O2 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -O2 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O2 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#Char:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#PtrChar:]] = OpTypePointer CrossWorkgroup %[[#Char]] +; CHECK-DAG: %[[#Int:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#PtrInt:]] = OpTypePointer CrossWorkgroup %[[#Int]] +; CHECK-DAG: %[[#C648:]] = OpConstant %[[#]] 648 +; CHECK-DAG: %[[#Struct:]] = OpTypeStruct %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] +; CHECK-DAG: %[[#VarInit:]] = OpConstantNull %[[#Struct]] +; CHECK-DAG: %[[#PtrStruct:]] = OpTypePointer CrossWorkgroup %[[#Struct]] +; CHECK-DAG: %[[#Var:]] = OpVariable %[[#PtrStruct]] CrossWorkgroup %[[#VarInit]] +; CHECK-DAG: %[[#Bytes:]] = OpVariable %[[#PtrChar]] CrossWorkgroup %[[#]] +; CHECK-DAG: %[[#BytesGEP:]] = OpSpecConstantOp %[[#PtrChar]] 70 %[[#Bytes]] %[[#C648]] + +; CHECK: OpFunction +; CHECK: %[[#]] = OpFunctionParameter %[[#]] +; CHECK: %[[#Line:]] = OpFunctionParameter %[[#Int]] +; CHECK: %[[#]] = OpFunctionParameter %[[#]] +; CHECK: %[[#Casted:]] = OpBitcast %[[#PtrChar]] %[[#Var]] +; CHECK: %[[#AddrChar:]] = OpInBoundsPtrAccessChain %[[#PtrChar]] %[[#Casted]] %[[#C648]] +; CHECK: %[[#AddrInt:]] = OpBitcast %[[#PtrInt]] %[[#AddrChar]] +; CHECK: OpStore %[[#AddrInt]] %[[#Line]] + +%struct = type { i32, [257 x i8], [257 x i8], [129 x i8], i32, i64, i64, i64, i64, i64, i64 } +@Mem = linkonce_odr dso_local addrspace(1) global %struct zeroinitializer, align 8 + +define weak dso_local spir_func void @foo(ptr addrspace(4) noundef %expr, i32 noundef %line, i1 %fl) { +entry: + %cmp = icmp eq i32 %line, 0 + br i1 %cmp, label %lbl, label %exit + +lbl: + store i32 %line, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @Mem, i64 648), align 8 + br i1 %fl, label %lbl, label %exit + +exit: + ret void +} + +; CHECK: OpFunction +; CHECK: %[[#]] = OpFunctionParameter %[[#]] +; CHECK: %[[#Line2:]] = OpFunctionParameter %[[#Int]] +; CHECK: %[[#]] = OpFunctionParameter %[[#]] +; CHECK: %[[#AddrInt2:]] = OpBitcast %[[#PtrInt]] %[[#BytesGEP]] +; CHECK: OpStore %[[#AddrInt2]] %[[#Line2]] + +@Bytes = linkonce_odr dso_local addrspace(1) global i8 zeroinitializer, align 8 + +define weak dso_local spir_func void @bar(ptr addrspace(4) noundef %expr, i32 noundef %line, i1 %fl) { +entry: + %cmp = icmp eq i32 %line, 0 + br i1 %cmp, label %lbl, label %exit + +lbl: + store i32 %line, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @Bytes, i64 648), align 8 + br i1 %fl, label %lbl, label %exit + +exit: + ret void +} From 1b0400eed8613108d9f293b9ddd3380e3241ac60 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 11 Sep 2024 13:07:47 +0100 Subject: [PATCH 089/114] [X86] combineSubABS - handle NEG(ABD()) expanded patterns combineSubABS already handles the "(sub Y, cmovns X, -X) -> (add Y, cmovns -X, X)" fold 
by flipping the cmov operands. We can do something similar for the negation of ABDS/U patterns which have been expanded to a CMOVL/CMOVB with a pair of commuted subtractions: "NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y))" --- llvm/lib/Target/X86/X86ISelLowering.cpp | 56 +++++++++++------- llvm/test/CodeGen/X86/abds-neg.ll | 75 +++++++++---------------- llvm/test/CodeGen/X86/abdu-neg.ll | 57 +++++++------------ 3 files changed, 80 insertions(+), 108 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a1d466eee691c..d0794cb9bfde3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -56096,34 +56096,50 @@ static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) { if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse()) return SDValue(); - X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2); - if (CC != X86::COND_S && CC != X86::COND_NS) - return SDValue(); - - // Condition should come from a negate operation. SDValue Cond = N1.getOperand(3); - if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0))) + if (Cond.getOpcode() != X86ISD::SUB) return SDValue(); assert(Cond.getResNo() == 1 && "Unexpected result number"); - // Get the X and -X from the negate. - SDValue NegX = Cond.getValue(0); - SDValue X = Cond.getOperand(1); - SDValue FalseOp = N1.getOperand(0); SDValue TrueOp = N1.getOperand(1); + X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2); + MVT VT = N->getSimpleValueType(0); + SDLoc DL(N); - // Cmov operands should be X and NegX. Order doesn't matter. - if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X)) - return SDValue(); + // ABS condition should come from a negate operation. + if ((CC == X86::COND_S || CC == X86::COND_NS) && + isNullConstant(Cond.getOperand(0))) { + // Get the X and -X from the negate. + SDValue NegX = Cond.getValue(0); + SDValue X = Cond.getOperand(1); - // Build a new CMOV with the operands swapped. - SDLoc DL(N); - MVT VT = N->getSimpleValueType(0); - SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, - N1.getOperand(2), Cond); - // Convert sub to add. - return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov); + // Cmov operands should be X and NegX. Order doesn't matter. + if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X)) + return SDValue(); + + // Build a new CMOV with the operands swapped. + SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, + N1.getOperand(2), Cond); + // Convert sub to add. + return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov); + } + + // Handle ABD special case: + // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)). + // ABD condition should come from a pair of matching subtracts. + if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) && + (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) && + (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) && + (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) && + (TrueOp.getOperand(0) == FalseOp.getOperand(1)) && + (TrueOp.getOperand(1) == FalseOp.getOperand(0))) { + // Build a new CMOV with the operands swapped. 
+ return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2), + Cond); + } + + return SDValue(); } static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) { diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index b9b3436dd1ed9..6e22d855dc831 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -112,8 +112,7 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -144,8 +143,7 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -176,8 +174,7 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -208,8 +205,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -217,8 +213,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %aext = sext i32 %a to i64 %bext = sext i32 %b to i64 @@ -237,8 +232,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_i16: @@ -247,8 +241,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: subl %eax, %ecx ; X64-NEXT: subl %edi, %eax -; X64-NEXT: cmovll %ecx, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovgel %ecx, %eax ; X64-NEXT: retq %aext = sext i32 %a to i64 %bext = sext i16 %b to i64 @@ -267,8 +260,7 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: @@ -276,8 +268,7 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %aext = sext i32 %a to i64 %bext = sext i32 %b to i64 @@ -319,8 +310,7 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %aext = sext i64 %a to i128 %bext = sext i64 %b to i128 @@ -362,8 +352,7 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; X64-NEXT: 
movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %aext = sext i64 %a to i128 %bext = sext i64 %b to i128 @@ -558,8 +547,7 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -587,8 +575,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_minmax_i32: @@ -596,8 +583,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %min = call i32 @llvm.smin.i32(i32 %a, i32 %b) %max = call i32 @llvm.smax.i32(i32 %a, i32 %b) @@ -641,8 +627,7 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %min = call i64 @llvm.smin.i64(i64 %a, i64 %b) %max = call i64 @llvm.smax.i64(i64 %a, i64 %b) @@ -776,8 +761,7 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -806,8 +790,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i32: @@ -815,8 +798,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %cmp = icmp sge i32 %a, %b %ab = sub i32 %a, %b @@ -853,8 +835,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b @@ -1031,8 +1012,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32: @@ -1040,8 +1020,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 false) @@ -1057,8 +1036,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax 
-; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32_undef: @@ -1066,8 +1044,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 true) @@ -1098,8 +1075,7 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 false) @@ -1130,8 +1106,7 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 true) diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll index 1ded7e79e2510..6bda99c89a37e 100644 --- a/llvm/test/CodeGen/X86/abdu-neg.ll +++ b/llvm/test/CodeGen/X86/abdu-neg.ll @@ -112,8 +112,7 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -144,8 +143,7 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -176,8 +174,7 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -208,8 +205,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -217,8 +213,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %aext = zext i32 %a to i64 %bext = zext i32 %b to i64 @@ -237,8 +232,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_i16: @@ -247,8 +241,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: subl %eax, %ecx ; X64-NEXT: subl %edi, %eax -; X64-NEXT: cmovbl %ecx, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovael %ecx, %eax ; X64-NEXT: retq %aext = zext i32 %a to i64 %bext = zext i16 %b to i64 @@ -267,8 +260,7 @@ define i32 
@abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: @@ -276,8 +268,7 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %aext = zext i32 %a to i64 %bext = zext i32 %b to i64 @@ -313,8 +304,7 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %aext = zext i64 %a to i128 %bext = zext i64 %b to i128 @@ -350,8 +340,7 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %aext = zext i64 %a to i128 %bext = zext i64 %b to i128 @@ -540,8 +529,7 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -569,8 +557,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_minmax_i32: @@ -578,8 +565,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %min = call i32 @llvm.umin.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %a, i32 %b) @@ -623,8 +609,7 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %min = call i64 @llvm.umin.i64(i64 %a, i64 %b) %max = call i64 @llvm.umax.i64(i64 %a, i64 %b) @@ -758,8 +743,7 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -788,8 +772,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i32: @@ -797,8 +780,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %cmp = icmp uge i32 %a, %b %ab = sub i32 %a, %b @@ -832,8 +814,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq 
%rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b From b9c2e2e3e910f8283f52c574fd8b6a7981d6cb0d Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Wed, 11 Sep 2024 14:18:54 +0200 Subject: [PATCH 090/114] [bazel] port 2f3d061918ece414d6db544a34b2e44a9950bc23 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index b43bdb7b5f471..c931898ed98e3 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -10516,6 +10516,10 @@ gentbl_cc_library( ["-gen-op-doc"], "g3doc/Dialects/OpenMP/OpenMPOps.md", ), + ( + ["-gen-openmp-clause-ops"], + "include/mlir/Dialect/OpenMP/OpenMPClauseOps.h.inc", + ), ], tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenMP/OpenMPOps.td", From 80fcab8c26129a98f01ce4f8d9cc90f3653bf693 Mon Sep 17 00:00:00 2001 From: MichelleCDjunaidi Date: Wed, 11 Sep 2024 22:32:01 +1000 Subject: [PATCH 091/114] [Docs][clang-query] disclose Windows linetab bug on clang-query tab auto-complete (#107956) As per https://github.com/llvm/llvm-project/pull/106672/#issuecomment-2325577815 and https://github.com/llvm/llvm-project/issues/107377, the documentation should be updated to note that the current bug on Windows involving ``LineEditor`` causing Tab key related features to not work. Fixes #107377 --- .../docs/clang-tidy/Contributing.rst | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst index d5303418b859b..ff8b05ff263c1 100644 --- a/clang-tools-extra/docs/clang-tidy/Contributing.rst +++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -344,18 +344,20 @@ matching expressions to simplify your matcher. clang-query> let c1 cxxRecordDecl() clang-query> match c1 -Alternatively, pressing the tab key after a previous matcher's open parentheses would also -show which matchers can be chained with the previous matcher, though some matchers that work -may not be listed. - -Just like breaking up a huge function into smaller chunks with intention-revealing names -can help you understand a complex algorithm, breaking up a matcher into smaller matchers -with intention-revealing names can help you understand a complicated matcher. - -Once you have a working clang-query matcher, the C++ API matchers will be the same or similar -to your interactively constructed matcher (there can be cases where they differ slightly). -You can use local variables to preserve your intention-revealing names that you applied -to nested matchers. +Alternatively, pressing the tab key after a previous matcher's open parentheses +would also show which matchers can be chained with the previous matcher, +though some matchers that work may not be listed. Note that tab completion +does not currently work on Windows. + +Just like breaking up a huge function into smaller chunks with +intention-revealing names can help you understand a complex algorithm, breaking +up a matcher into smaller matchers with intention-revealing names can help +you understand a complicated matcher. 
+ +Once you have a working :program:`clang-query` matcher, the C++ API matchers +will be the same or similar to your interactively constructed matcher (there +can be cases where they differ slightly). You can use local variables to preserve +your intention-revealing names that you applied to nested matchers. Creating private matchers ^^^^^^^^^^^^^^^^^^^^^^^^^ From 49b57df16144450331b4f90332d6918168b2a306 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Sep 2024 14:38:56 +0200 Subject: [PATCH 092/114] DXIL: Use correct type ID when writing ValueAsMetadata. (#94337) When emitting references to functions as part of `ValueAsMetadata`, we currently emit the incorrect (typed) pointer, resulting in crashes during deserialization. Avoid this by correctly mapping the type during serialization. --- .../Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 2 +- llvm/test/tools/dxil-dis/metadata.ll | 10 +++++++++- llvm/tools/dxil-dis/CMakeLists.txt | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index cd0d6d34e9a67..45aadac861946 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1345,7 +1345,7 @@ void DXILBitcodeWriter::writeValueAsMetadata( Ty = TypedPointerType::get(F->getFunctionType(), F->getAddressSpace()); else if (GlobalVariable *GV = dyn_cast(V)) Ty = TypedPointerType::get(GV->getValueType(), GV->getAddressSpace()); - Record.push_back(getTypeID(Ty)); + Record.push_back(getTypeID(Ty, V)); Record.push_back(VE.getValueID(V)); Stream.EmitRecord(bitc::METADATA_VALUE, Record, 0); Record.clear(); diff --git a/llvm/test/tools/dxil-dis/metadata.ll b/llvm/test/tools/dxil-dis/metadata.ll index 758860a2deb8f..18f2530ab8fc2 100644 --- a/llvm/test/tools/dxil-dis/metadata.ll +++ b/llvm/test/tools/dxil-dis/metadata.ll @@ -1,13 +1,21 @@ -; RUN: llc --filetype=obj %s -o - | dxil-dis +; RUN: llc --filetype=obj %s -o - | dxil-dis target triple = "dxil-unknown-shadermodel6.7-library" +define void @kernel(ptr addrspace(1)) { + ret void +} + !llvm.foo = !{!0} !llvm.bar = !{!1} +!llvm.baz = !{!2} !0 = !{i32 42} !1 = !{!"Some MDString"} +!2 = !{ptr @kernel} ; CHECK: !llvm.foo = !{!0} ; CHECK: !llvm.bar = !{!1} +; CHECK: !llvm.baz = !{!2} ; CHECK: !0 = !{i32 42} ; CHECK: !1 = !{!"Some MDString"} +; CHECK: !2 = !{void (i8 addrspace(1)*)* @kernel} diff --git a/llvm/tools/dxil-dis/CMakeLists.txt b/llvm/tools/dxil-dis/CMakeLists.txt index 9addf108a8614..d0541fcf802e9 100644 --- a/llvm/tools/dxil-dis/CMakeLists.txt +++ b/llvm/tools/dxil-dis/CMakeLists.txt @@ -25,7 +25,9 @@ include(ExternalProject) set(SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/DXC-src) set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/DXC-bins) -set(GIT_SETTINGS GIT_REPOSITORY https://github.com/microsoft/DirectXShaderCompiler.git) +set(GIT_SETTINGS + GIT_REPOSITORY https://github.com/microsoft/DirectXShaderCompiler.git + GIT_TAG main) if (DXC_SOURCE_DIR) set(SOURCE_DIR ${DXC_SOURCE_DIR}) From 99a235499337aff27d087c31916d6785d2e9a263 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 11 Sep 2024 14:46:40 +0200 Subject: [PATCH 093/114] [LLD][COFF] Add support for ARM64EC import call thunks. (#107931) These thunks can be accessed using `__impchk_*` symbols, though they are typically not called directly. Instead, they are used to populate the auxiliary IAT. 
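For orientation, each `__impchk_*` thunk is a five-instruction sequence built from the `importThunkARM64EC` template added in this patch. Schematically (symbol names are illustrative; the linker fills in the actual addresses), it looks like this:

```
__impchk_func:
    adrp x11, __imp_func                 // page address of the IAT entry
    ldr  x11, [x11, :lo12:__imp_func]    // load the imported function pointer
    adrp x10, func_exit_thunk            // exit thunk address (0 if there is no exit thunk)
    add  x10, x10, :lo12:func_exit_thunk
    b    __icall_helper_arm64ec          // dispatch through the EC indirect-call helper
```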
When the imported function is x86_64 (or an ARM64EC function with a patched export thunk), the thunk is used to call it. Otherwise, the OS may replace the thunk at runtime with a direct pointer to the ARM64EC function to avoid the overhead. --- lld/COFF/Chunks.cpp | 22 ++++++ lld/COFF/Chunks.h | 24 +++++++ lld/COFF/Config.h | 1 + lld/COFF/Driver.cpp | 7 +- lld/COFF/Driver.h | 2 + lld/COFF/InputFiles.cpp | 7 ++ lld/COFF/InputFiles.h | 2 + lld/COFF/MarkLive.cpp | 22 ++++-- lld/COFF/SymbolTable.cpp | 16 ++++- lld/COFF/SymbolTable.h | 4 +- lld/COFF/Writer.cpp | 2 + lld/test/COFF/Inputs/loadconfig-arm64ec.s | 2 + lld/test/COFF/arm64ec-import.test | 84 ++++++++++++++++++++--- 13 files changed, 176 insertions(+), 19 deletions(-) diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 060eb6c32004d..0f33885f7df37 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1093,4 +1093,26 @@ void CHPERedirectionChunk::writeTo(uint8_t *buf) const { } } +ImportThunkChunkARM64EC::ImportThunkChunkARM64EC(ImportFile *file) + : ImportThunkChunk(file->ctx, file->impSym), file(file) {} + +void ImportThunkChunkARM64EC::writeTo(uint8_t *buf) const { + memcpy(buf, importThunkARM64EC, sizeof(importThunkARM64EC)); + applyArm64Addr(buf, file->impSym->getRVA(), rva, 12); + applyArm64Ldr(buf + 4, file->impSym->getRVA() & 0xfff); + + // The exit thunk may be missing. This can happen if the application only + // references a function by its address (in which case the thunk is never + // actually used, but is still required to fill the auxiliary IAT), or in + // cases of hand-written assembly calling an imported ARM64EC function (where + // the exit thunk is ignored by __icall_helper_arm64ec). In such cases, MSVC + // link.exe uses 0 as the RVA. + uint32_t exitThunkRVA = exitThunk ? exitThunk->getRVA() : 0; + applyArm64Addr(buf + 8, exitThunkRVA, rva + 8, 12); + applyArm64Imm(buf + 12, exitThunkRVA & 0xfff, 0); + + Defined *helper = cast(file->ctx.config.arm64ECIcallHelper); + applyArm64Branch26(buf + 16, helper->getRVA() - rva - 16); +} + } // namespace lld::coff diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index 30e5b538c352e..28e0fd68ac515 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -544,6 +544,14 @@ static const uint8_t importThunkARM64[] = { 0x00, 0x02, 0x1f, 0xd6, // br x16 }; +static const uint32_t importThunkARM64EC[] = { + 0x9000000b, // adrp x11, 0x0 + 0xf940016b, // ldr x11, [x11] + 0x9000000a, // adrp x10, 0x0 + 0x9100014a, // add x10, x10, #0x0 + 0x14000000 // b 0x0 +}; + // Windows-specific. // A chunk for DLL import jump table entry. In a final output, its // contents will be a JMP instruction to some __imp_ symbol. @@ -599,6 +607,22 @@ class ImportThunkChunkARM64 : public ImportThunkChunk { MachineTypes getMachine() const override { return ARM64; } }; +// ARM64EC __impchk_* thunk implementation. +// Performs an indirect call to an imported function pointer +// using the __icall_helper_arm64ec helper function. 
+class ImportThunkChunkARM64EC : public ImportThunkChunk { +public: + explicit ImportThunkChunkARM64EC(ImportFile *file); + size_t getSize() const override { return sizeof(importThunkARM64EC); }; + MachineTypes getMachine() const override { return ARM64EC; } + void writeTo(uint8_t *buf) const override; + + Defined *exitThunk; + +private: + ImportFile *file; +}; + class RangeExtensionThunkARM : public NonSectionCodeChunk { public: explicit RangeExtensionThunkARM(COFFLinkerContext &ctx, Defined *t) diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 947f3fead54e0..738776a971ea3 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -164,6 +164,7 @@ struct Configuration { std::set delayLoads; std::map dllOrder; Symbol *delayLoadHelper = nullptr; + Symbol *arm64ECIcallHelper = nullptr; bool saveTemps = false; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 3ef9fa3f65c6a..a1fe6444991a3 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1383,6 +1383,11 @@ void LinkerDriver::createECExportThunks() { } } +void LinkerDriver::pullArm64ECIcallHelper() { + if (!ctx.config.arm64ECIcallHelper) + ctx.config.arm64ECIcallHelper = addUndefined("__icall_helper_arm64ec"); +} + // In MinGW, if no symbols are chosen to be exported, then all symbols are // automatically exported by default. This behavior can be forced by the // -export-all-symbols option, so that it happens even when exports are @@ -2685,7 +2690,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (auto *arg = args.getLastArg(OPT_print_symbol_order)) config->printSymbolOrder = arg->getValue(); - ctx.symtab.initializeEntryThunks(); + ctx.symtab.initializeECThunks(); // Identify unreferenced COMDAT sections. if (config->doGC) { diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h index b5cf8e2f18fd4..0c195a7cc3148 100644 --- a/lld/COFF/Driver.h +++ b/lld/COFF/Driver.h @@ -101,6 +101,8 @@ class LinkerDriver { std::unique_ptr tar; // for /linkrepro + void pullArm64ECIcallHelper(); + private: // Searches a file from search paths. std::optional findFileIfNew(StringRef filename); diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index c7956baf73cf4..3dbdf8fe3920d 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -190,6 +190,8 @@ void ObjFile::initializeECThunks() { ctx.symtab.addEntryThunk(getSymbol(entry->src), getSymbol(entry->dst)); break; case Arm64ECThunkType::Exit: + ctx.symtab.addExitThunk(getSymbol(entry->src), getSymbol(entry->dst)); + break; case Arm64ECThunkType::GuestExit: break; default: @@ -1088,6 +1090,11 @@ void ImportFile::parse() { thunkSym = ctx.symtab.addImportThunk( name, impSym, make(ctx, impSym)); // FIXME: Add aux IAT symbols. 
+ + StringRef impChkName = saver().save("__impchk_" + name); + impchkThunk = make(this); + ctx.symtab.addImportThunk(impChkName, impSym, impchkThunk); + ctx.driver.pullArm64ECIcallHelper(); } } } diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 1d55b4f34f754..3b837017e1c21 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -56,6 +56,7 @@ class DefinedImportData; class DefinedImportThunk; class DefinedRegular; class ImportThunkChunk; +class ImportThunkChunkARM64EC; class SectionChunk; class Symbol; class Undefined; @@ -349,6 +350,7 @@ class ImportFile : public InputFile { DefinedImportData *impSym = nullptr; Symbol *thunkSym = nullptr; + ImportThunkChunkARM64EC *impchkThunk = nullptr; std::string dllName; private: diff --git a/lld/COFF/MarkLive.cpp b/lld/COFF/MarkLive.cpp index 06079a98f2d00..8af58780e1358 100644 --- a/lld/COFF/MarkLive.cpp +++ b/lld/COFF/MarkLive.cpp @@ -43,13 +43,23 @@ void markLive(COFFLinkerContext &ctx) { worklist.push_back(c); }; - auto addSym = [&](Symbol *b) { - if (auto *sym = dyn_cast(b)) + std::function addSym; + + auto addImportFile = [&](ImportFile *file) { + file->live = true; + if (file->impchkThunk && file->impchkThunk->exitThunk) + addSym(file->impchkThunk->exitThunk); + }; + + addSym = [&](Symbol *b) { + if (auto *sym = dyn_cast(b)) { enqueue(sym->getChunk()); - else if (auto *sym = dyn_cast(b)) - sym->file->live = true; - else if (auto *sym = dyn_cast(b)) - sym->wrappedSym->file->live = sym->wrappedSym->file->thunkLive = true; + } else if (auto *sym = dyn_cast(b)) { + addImportFile(sym->file); + } else if (auto *sym = dyn_cast(b)) { + addImportFile(sym->wrappedSym->file); + sym->wrappedSym->file->thunkLive = true; + } }; // Add GC root chunks. diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index c9b3d78e3de17..a6575ecac3bb4 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -557,7 +557,11 @@ void SymbolTable::addEntryThunk(Symbol *from, Symbol *to) { entryThunks.push_back({from, to}); } -void SymbolTable::initializeEntryThunks() { +void SymbolTable::addExitThunk(Symbol *from, Symbol *to) { + exitThunks[from] = to; +} + +void SymbolTable::initializeECThunks() { for (auto it : entryThunks) { auto *to = dyn_cast(it.second); if (!to) @@ -573,6 +577,16 @@ void SymbolTable::initializeEntryThunks() { } from->getChunk()->setEntryThunk(to); } + + for (ImportFile *file : ctx.importFileInstances) { + if (!file->impchkThunk) + continue; + + Symbol *sym = exitThunks.lookup(file->thunkSym); + if (!sym) + sym = exitThunks.lookup(file->impSym); + file->impchkThunk->exitThunk = dyn_cast_or_null(sym); + } } Symbol *SymbolTable::addUndefined(StringRef name, InputFile *f, diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index 3a277fc700e86..13e151e3a8c50 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -108,7 +108,8 @@ class SymbolTable { ImportThunkChunk *chunk); void addLibcall(StringRef name); void addEntryThunk(Symbol *from, Symbol *to); - void initializeEntryThunks(); + void addExitThunk(Symbol *from, Symbol *to); + void initializeECThunks(); void reportDuplicate(Symbol *existing, InputFile *newFile, SectionChunk *newSc = nullptr, @@ -141,6 +142,7 @@ class SymbolTable { std::unique_ptr lto; bool ltoCompilationDone = false; std::vector> entryThunks; + llvm::DenseMap exitThunks; COFFLinkerContext &ctx; }; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 3cb9b3b512ead..b589a16bca32a 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1248,6 +1248,8 @@ 
void Writer::appendImportThunks() { DefinedImportThunk *thunk = cast(file->thunkSym); if (file->thunkLive) textSec->addChunk(thunk->getChunk()); + if (file->impchkThunk) + textSec->addChunk(file->impchkThunk); } if (!delayIdata.empty()) { diff --git a/lld/test/COFF/Inputs/loadconfig-arm64ec.s b/lld/test/COFF/Inputs/loadconfig-arm64ec.s index 78e7fba43a0a4..75dc6105301d0 100644 --- a/lld/test/COFF/Inputs/loadconfig-arm64ec.s +++ b/lld/test/COFF/Inputs/loadconfig-arm64ec.s @@ -30,6 +30,8 @@ __os_arm64x_dispatch_ret: .xword 0 __os_arm64x_check_call: .xword 0 + .globl __os_arm64x_dispatch_icall +__os_arm64x_dispatch_icall: __os_arm64x_check_icall: .xword 0 __os_arm64x_get_x64_information: diff --git a/lld/test/COFF/arm64ec-import.test b/lld/test/COFF/arm64ec-import.test index b1c47d785e445..44a84c09e11a3 100644 --- a/lld/test/COFF/arm64ec-import.test +++ b/lld/test/COFF/arm64ec-import.test @@ -2,17 +2,19 @@ REQUIRES: aarch64, x86 RUN: split-file %s %t.dir && cd %t.dir RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test.s -o test.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows icall.s -o icall.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows hybmp.s -o hybmp.obj RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj RUN: llvm-lib -machine:arm64ec -def:test.def -out:test-arm64ec.lib RUN: llvm-lib -machine:arm64ec -def:test2.def -out:test2-arm64ec.lib RUN: llvm-lib -machine:x64 -def:test.def -out:test-x86_64.lib Link using ARM64EC import library: -RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj \ +RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj icall.obj hybmp.obj \ RUN: test.obj test-arm64ec.lib test2-arm64ec.lib Link using x86_64 import library: -RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll loadconfig-arm64ec.obj \ +RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll loadconfig-arm64ec.obj icall.obj hybmp.obj \ RUN: test.obj test-x86_64.lib test2-arm64ec.lib RUN: llvm-readobj --coff-imports out.dll | FileCheck --check-prefix=IMPORTS %s @@ -20,7 +22,7 @@ RUN: llvm-readobj --coff-imports out2.dll | FileCheck --check-prefix=IMPORTS %s IMPORTS: Import { IMPORTS-NEXT: Name: test.dll IMPORTS-NEXT: ImportLookupTableRVA: -IMPORTS-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-NEXT: ImportAddressTableRVA: 0x3000 IMPORTS-NEXT: Symbol: data (0) IMPORTS-NEXT: Symbol: func (0) IMPORTS-NEXT: Symbol: func2 (0) @@ -28,24 +30,45 @@ IMPORTS-NEXT: } IMPORTS-NEXT: Import { IMPORTS-NEXT: Name: test2.dll IMPORTS-NEXT: ImportLookupTableRVA: -IMPORTS-NEXT: ImportAddressTableRVA: 0x2020 +IMPORTS-NEXT: ImportAddressTableRVA: 0x3020 IMPORTS-NEXT: Symbol: t2func (0) IMPORTS-NEXT: } RUN: llvm-objdump -d out.dll | FileCheck --check-prefix=DISASM %s RUN: llvm-objdump -d out2.dll | FileCheck --check-prefix=DISASM %s -DISASM: 0000000180001000 <.text>: -DISASM-NEXT: 180001000: ff 25 02 10 00 00 jmpq *0x1002(%rip) # 0x180002008 +DISASM: 180001000: 52800000 mov w0, #0x0 // =0 +DISASM-NEXT: 180001004: d65f03c0 ret +DISASM-NEXT: 180001008: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 18000100c: f940056b ldr x11, [x11, #0x8] +DISASM-NEXT: 180001010: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 180001014: 9101114a add x10, x10, #0x44 +DISASM-NEXT: 180001018: 17fffffa b 0x180001000 <.text> +DISASM-NEXT: 18000101c: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 180001020: f940096b ldr x11, [x11, #0x10] +DISASM-NEXT: 180001024: f0ffffea adrp x10, 0x180000000 +DISASM-NEXT: 
180001028: 9100014a add x10, x10, #0x0 +DISASM-NEXT: 18000102c: 17fffff5 b 0x180001000 <.text> +DISASM-NEXT: 180001030: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 180001034: f940116b ldr x11, [x11, #0x20] +DISASM-NEXT: 180001038: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 18000103c: 9101314a add x10, x10, #0x4c +DISASM-NEXT: 180001040: 17fffff0 b 0x180001000 <.text> +DISASM-NEXT: 180001044: 52800020 mov w0, #0x1 // =1 +DISASM-NEXT: 180001048: d65f03c0 ret +DISASM-NEXT: 18000104c: 52800040 mov w0, #0x2 // =2 +DISASM-NEXT: 180001050: d65f03c0 ret +DISASM-NEXT: ... +DISASM-NEXT: 180002000: ff 25 02 10 00 00 jmpq *0x1002(%rip) # 0x180003008 RUN: llvm-readobj --hex-dump=.test out.dll | FileCheck --check-prefix=TESTSEC %s RUN: llvm-readobj --hex-dump=.test out2.dll | FileCheck --check-prefix=TESTSEC %s -TESTSEC: 0x180005000 08200000 00200000 10200000 20200000 -TESTSEC-NEXT: 0x180005010 00100000 +TESTSEC: 0x180006000 08300000 00300000 10300000 20300000 +TESTSEC-NEXT: 0x180006010 08100000 1c100000 00200000 RUN: llvm-readobj --headers out.dll | FileCheck -check-prefix=HEADERS %s -HEADERS: LoadConfigTableRVA: 0x3008 -HEADERS: IATRVA: 0x2000 +HEADERS: LoadConfigTableRVA: 0x4010 +HEADERS: IATRVA: 0x3000 HEADERS: IATSize: 0x1000 #--- test.s @@ -57,8 +80,49 @@ arm64ec_data_sym: .rva __imp_data .rva __imp_func2 .rva __imp_t2func + .rva __impchk_func + .rva __impchk_func2 .rva func +#--- icall.s + .text + .globl __icall_helper_arm64ec + .p2align 2, 0x0 +__icall_helper_arm64ec: + mov w0, #0 + ret + +#--- hybmp.s + .section .hybmp$x, "yi" + // __imp_func exit thunk is ignored when func is defined as well + .symidx __imp_func + .symidx dead_exit_thunk + .word 4 + .symidx func + .symidx func_exit_thunk + .word 4 + .symidx __imp_t2func + .symidx t2func_exit_thunk + .word 4 + + .section .wowthk$aa,"xr",discard,func_exit_thunk + .globl func_exit_thunk +func_exit_thunk: + mov w0, #1 + ret + + .section .wowthk$aa,"xr",discard,t2func_exit_thunk + .globl t2func_exit_thunk +t2func_exit_thunk: + mov w0, #2 + ret + + .section .wowthk$aa,"xr",discard,dead_exit_thunk + .globl dead_exit_thunk +dead_exit_thunk: + mov w0, #0xdead + ret + #--- test.def NAME test.dll EXPORTS From 5904448ceb67d6a7bd752aa4b54d9acb64bcc533 Mon Sep 17 00:00:00 2001 From: Tulio Magno Quites Machado Filho Date: Wed, 11 Sep 2024 09:57:22 -0300 Subject: [PATCH 094/114] Avoid exposing password and token from git repositories (#105220) Try to detect if the git remote URL has a password or a Github token and return an error teaching the user how to avoid leaking their password or token. --- llvm/cmake/modules/VersionFromVCS.cmake | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/llvm/cmake/modules/VersionFromVCS.cmake b/llvm/cmake/modules/VersionFromVCS.cmake index 18edbeabe3e4b..da42781d2ae39 100644 --- a/llvm/cmake/modules/VersionFromVCS.cmake +++ b/llvm/cmake/modules/VersionFromVCS.cmake @@ -39,6 +39,30 @@ function(get_source_info path revision repository) OUTPUT_VARIABLE git_output ERROR_QUIET) if(git_result EQUAL 0) + # Passwords or tokens should not be stored in the remote URL at the + # risk of being leaked. In case we find one, error out and teach the + # user the best practices. + string(REGEX MATCH "https?://[^/]*:[^/]*@.*" + http_password "${git_output}") + if(http_password) + message(SEND_ERROR "The git remote repository URL has an embedded \ +password. 
Remove the password from the URL or use \ +`-DLLVM_FORCE_VC_REPOSITORY=` in order to avoid \ +leaking your password (see https://git-scm.com/docs/gitcredentials for \ +alternatives).") + endif() + # GitHub token formats are described at: + # https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/about-authentication-to-github#githubs-token-formats + string(REGEX MATCH + "https?://(gh[pousr]|github_pat)_[^/]+@github.com.*" + github_token "${git_output}") + if(github_token) + message(SEND_ERROR "The git remote repository URL has an embedded \ +GitHub Token. Remove the token from the URL or use \ +`-DLLVM_FORCE_VC_REPOSITORY=` in order to avoid leaking \ +your token (see https://git-scm.com/docs/gitcredentials for alternatives).") + endif() + string(STRIP "${git_output}" git_output) set(${repository} ${git_output} PARENT_SCOPE) else()

From 5f25b89513954b00e045b9fdf1a16f3a34e04c52 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 11 Sep 2024 05:57:50 -0700 Subject: [PATCH 095/114] [TableGen] Migrate Option Emitters to const RecordKeeper (#107696)

Migrate Opt/OptRST Emitters to const RecordKeeper. This is part of an effort to improve const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089

--- llvm/utils/TableGen/Common/OptEmitter.cpp | 16 ++++++------ llvm/utils/TableGen/Common/OptEmitter.h | 4 +-- llvm/utils/TableGen/ExegesisEmitter.cpp | 17 +++++++------ llvm/utils/TableGen/OptParserEmitter.cpp | 8 +++--- llvm/utils/TableGen/OptRSTEmitter.cpp | 30 +++++++++-------------- 5 files changed, 34 insertions(+), 41 deletions(-) diff --git a/llvm/utils/TableGen/Common/OptEmitter.cpp b/llvm/utils/TableGen/Common/OptEmitter.cpp index 7fcf3074e0931..1c91ec5b3dbc4 100644 --- a/llvm/utils/TableGen/Common/OptEmitter.cpp +++ b/llvm/utils/TableGen/Common/OptEmitter.cpp @@ -1,4 +1,4 @@ -//===- OptEmitter.cpp - Helper for emitting options.----------- -----------===// +//===- OptEmitter.cpp - Helper for emitting options -------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -39,21 +39,19 @@ static int StrCmpOptionName(const char *A, const char *B) { return (a < b) ? -1 : 1; } -int CompareOptionRecords(Record *const *Av, Record *const *Bv) { - const Record *A = *Av; - const Record *B = *Bv; - +// Returns true if A is ordered before B. +bool CompareOptionRecords(const Record *A, const Record *B) { // Sentinel options precede all others and are only ordered by precedence. bool ASent = A->getValueAsDef("Kind")->getValueAsBit("Sentinel"); bool BSent = B->getValueAsDef("Kind")->getValueAsBit("Sentinel"); if (ASent != BSent) - return ASent ? -1 : 1; + return ASent; // Compare options by name, unless they are sentinels.
if (!ASent) if (int Cmp = StrCmpOptionName(A->getValueAsString("Name").str().c_str(), B->getValueAsString("Name").str().c_str())) - return Cmp; + return Cmp < 0; if (!ASent) { std::vector APrefixes = A->getValueAsListOfStrings("Prefixes"); @@ -65,7 +63,7 @@ int CompareOptionRecords(Record *const *Av, Record *const *Bv) { BEPre = BPrefixes.end(); APre != AEPre && BPre != BEPre; ++APre, ++BPre) { if (int Cmp = StrCmpOptionName(APre->str().c_str(), BPre->str().c_str())) - return Cmp; + return Cmp < 0; } } @@ -78,7 +76,7 @@ int CompareOptionRecords(Record *const *Av, Record *const *Bv) { PrintError(B->getLoc(), Twine("Other defined here")); PrintFatalError("Equivalent Options found."); } - return APrec < BPrec ? -1 : 1; + return APrec < BPrec; } } // namespace llvm diff --git a/llvm/utils/TableGen/Common/OptEmitter.h b/llvm/utils/TableGen/Common/OptEmitter.h index eaef966bbac66..5eecd61987337 100644 --- a/llvm/utils/TableGen/Common/OptEmitter.h +++ b/llvm/utils/TableGen/Common/OptEmitter.h @@ -1,4 +1,4 @@ -//===- OptEmitter.h - Helper for emitting options. --------------*- C++ -*-===// +//===- OptEmitter.h - Helper for emitting options ---------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,7 +11,7 @@ namespace llvm { class Record; -int CompareOptionRecords(Record *const *Av, Record *const *Bv); +bool CompareOptionRecords(const Record *A, const Record *B); } // namespace llvm #endif // LLVM_UTILS_TABLEGEN_COMMON_OPTEMITTER_H diff --git a/llvm/utils/TableGen/ExegesisEmitter.cpp b/llvm/utils/TableGen/ExegesisEmitter.cpp index 0de7cb4233748..a5dd2994b3753 100644 --- a/llvm/utils/TableGen/ExegesisEmitter.cpp +++ b/llvm/utils/TableGen/ExegesisEmitter.cpp @@ -30,7 +30,7 @@ namespace { class ExegesisEmitter { public: - ExegesisEmitter(RecordKeeper &RK); + ExegesisEmitter(const RecordKeeper &RK); void run(raw_ostream &OS) const; @@ -51,7 +51,7 @@ class ExegesisEmitter { void emitPfmCountersLookupTable(raw_ostream &OS) const; - RecordKeeper &Records; + const RecordKeeper &Records; std::string Target; // Table of counter name -> counter index. @@ -59,7 +59,7 @@ class ExegesisEmitter { }; static std::map -collectPfmCounters(RecordKeeper &Records) { +collectPfmCounters(const RecordKeeper &Records) { std::map PfmCounterNameTable; const auto AddPfmCounterName = [&PfmCounterNameTable]( const Record *PfmCounterDef) { @@ -67,7 +67,8 @@ collectPfmCounters(RecordKeeper &Records) { if (!Counter.empty()) PfmCounterNameTable.emplace(Counter, 0); }; - for (Record *Def : Records.getAllDerivedDefinitions("ProcPfmCounters")) { + for (const Record *Def : + Records.getAllDerivedDefinitions("ProcPfmCounters")) { // Check that ResourceNames are unique. 
llvm::SmallSet Seen; for (const Record *IssueCounter : @@ -95,9 +96,9 @@ collectPfmCounters(RecordKeeper &Records) { return PfmCounterNameTable; } -ExegesisEmitter::ExegesisEmitter(RecordKeeper &RK) +ExegesisEmitter::ExegesisEmitter(const RecordKeeper &RK) : Records(RK), PfmCounterNameTable(collectPfmCounters(RK)) { - std::vector Targets = Records.getAllDerivedDefinitions("Target"); + ArrayRef Targets = Records.getAllDerivedDefinitions("Target"); if (Targets.size() == 0) PrintFatalError("No 'Target' subclasses defined!"); if (Targets.size() != 1) @@ -223,7 +224,7 @@ void ExegesisEmitter::emitPfmCounters(raw_ostream &OS) const { } // namespace void ExegesisEmitter::emitPfmCountersLookupTable(raw_ostream &OS) const { - std::vector Bindings = + std::vector Bindings = Records.getAllDerivedDefinitions("PfmCountersBinding"); assert(!Bindings.empty() && "there must be at least one binding"); llvm::sort(Bindings, [](const Record *L, const Record *R) { @@ -232,7 +233,7 @@ void ExegesisEmitter::emitPfmCountersLookupTable(raw_ostream &OS) const { OS << "// Sorted (by CpuName) array of pfm counters.\n" << "static const CpuAndPfmCounters " << Target << "CpuPfmCounters[] = {\n"; - for (Record *Binding : Bindings) { + for (const Record *Binding : Bindings) { // Emit as { "cpu", procinit }, OS << " { \"" // << Binding->getValueAsString("CpuName") << "\"," // diff --git a/llvm/utils/TableGen/OptParserEmitter.cpp b/llvm/utils/TableGen/OptParserEmitter.cpp index 81195c8c106c2..a41c684f169e9 100644 --- a/llvm/utils/TableGen/OptParserEmitter.cpp +++ b/llvm/utils/TableGen/OptParserEmitter.cpp @@ -250,15 +250,15 @@ static void EmitHelpTextsForVariants( /// OptParserEmitter - This tablegen backend takes an input .td file /// describing a list of options and emits a data structure for parsing and /// working with those options when given an input command line. -static void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) { +static void EmitOptParser(const RecordKeeper &Records, raw_ostream &OS) { // Get the option groups and options. - const std::vector &Groups = + ArrayRef Groups = Records.getAllDerivedDefinitions("OptionGroup"); - std::vector Opts = Records.getAllDerivedDefinitions("Option"); + std::vector Opts = Records.getAllDerivedDefinitions("Option"); emitSourceFileHeader("Option Parsing Definitions", OS); - array_pod_sort(Opts.begin(), Opts.end(), CompareOptionRecords); + llvm::sort(Opts, CompareOptionRecords); // Generate prefix groups. typedef SmallVector, 2> PrefixKeyT; typedef std::map PrefixesT; diff --git a/llvm/utils/TableGen/OptRSTEmitter.cpp b/llvm/utils/TableGen/OptRSTEmitter.cpp index 75b7cbdf29887..43b0f78c44d90 100644 --- a/llvm/utils/TableGen/OptRSTEmitter.cpp +++ b/llvm/utils/TableGen/OptRSTEmitter.cpp @@ -16,30 +16,24 @@ using namespace llvm; /// OptParserEmitter - This tablegen backend takes an input .td file /// describing a list of options and emits a RST man page. -static void EmitOptRST(RecordKeeper &Records, raw_ostream &OS) { - llvm::StringMap> OptionsByGroup; +static void EmitOptRST(const RecordKeeper &Records, raw_ostream &OS) { + llvm::StringMap> OptionsByGroup; std::vector OptionsWithoutGroup; // Get the options. - std::vector Opts = Records.getAllDerivedDefinitions("Option"); - array_pod_sort(Opts.begin(), Opts.end(), CompareOptionRecords); + std::vector Opts = Records.getAllDerivedDefinitions("Option"); + llvm::sort(Opts, CompareOptionRecords); // Get the option groups. 
- const std::vector &Groups = - Records.getAllDerivedDefinitions("OptionGroup"); - for (unsigned i = 0, e = Groups.size(); i != e; ++i) { - const Record &R = *Groups[i]; - OptionsByGroup.try_emplace(R.getValueAsString("Name")); - } + for (const Record *R : Records.getAllDerivedDefinitions("OptionGroup")) + OptionsByGroup.try_emplace(R->getValueAsString("Name")); // Map options to their group. - for (unsigned i = 0, e = Opts.size(); i != e; ++i) { - const Record &R = *Opts[i]; - if (const DefInit *DI = dyn_cast(R.getValueInit("Group"))) { - OptionsByGroup[DI->getDef()->getValueAsString("Name")].push_back(Opts[i]); - } else { - OptionsByGroup["options"].push_back(Opts[i]); - } + for (const Record *R : Opts) { + if (const DefInit *DI = dyn_cast(R->getValueInit("Group"))) + OptionsByGroup[DI->getDef()->getValueAsString("Name")].push_back(R); + else + OptionsByGroup["options"].push_back(R); } // Print options under their group. @@ -49,7 +43,7 @@ static void EmitOptRST(RecordKeeper &Records, raw_ostream &OS) { OS << std::string(GroupName.size(), '-') << '\n'; OS << '\n'; - for (Record *R : KV.getValue()) { + for (const Record *R : KV.getValue()) { OS << ".. option:: "; // Print the prefix. From 6043321127fa3c51481beaee683a34c2d2ca468d Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 11 Sep 2024 08:57:58 -0400 Subject: [PATCH 096/114] [gn] port bc152fbf4315 (llvm-debuginfod-find driver_exe) --- .../llvm/tools/llvm-debuginfod-find/BUILD.gn | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn index 16b2c53438314..6b926bc777dca 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn @@ -1,6 +1,13 @@ import("//llvm/tools/binutils_symlinks.gni") +import("//llvm/utils/TableGen/tablegen.gni") +import("//llvm/utils/gn/build/driver_executable.gni") import("//llvm/utils/gn/build/symlink_or_copy.gni") +tablegen("Opts") { + visibility = [ ":llvm-debuginfod-find" ] + args = [ "-gen-opt-parser-defs" ] +} + if (llvm_install_binutils_symlinks) { symlink_or_copy("debuginfod-find") { deps = [ ":llvm-debuginfod-find" ] @@ -18,9 +25,11 @@ group("symlinks") { } } -executable("llvm-debuginfod-find") { +driver_executable("llvm-debuginfod-find") { deps = [ + ":Opts", "//llvm/lib/Debuginfod", + "//llvm/lib/Option", "//llvm/lib/Support", ] sources = [ "llvm-debuginfod-find.cpp" ] From 135bd31975192654629c9bd453533ba705af1dba Mon Sep 17 00:00:00 2001 From: Alex Rice Date: Wed, 11 Sep 2024 14:02:44 +0100 Subject: [PATCH 097/114] [mlir] [tblgen-to-irdl] Refactor tblgen-to-irdl script and support more types (#105505) Refactors the tblgen-to-irdl script slightly and adds support for - Various integer types - Various Float types - Confined types - Complex types (with fixed element type) Also doesn't add the operand and result ops if they are empty. I could potentially split this into smaller PRs if that'd be helpful (refactor + integer/float/complex, confined type, optional operand/result). 
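
For illustration, an op whose only argument is an I32 operand now
round-trips to IRDL along these lines (a sketch based on the new tests in
this patch; the op name is made up):

  irdl.operation @example {
    %0 = irdl.is i32
    irdl.operands(%0)
  }

Since the op declares no results, no `irdl.results` op is emitted for it.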
@math-fehr --- mlir/include/mlir/IR/CommonTypeConstraints.td | 5 +- mlir/test/tblgen-to-irdl/CMathDialect.td | 1 - mlir/test/tblgen-to-irdl/TestDialect.td | 59 +++++- .../tools/tblgen-to-irdl/OpDefinitionsGen.cpp | 179 +++++++++++++++++- 4 files changed, 227 insertions(+), 17 deletions(-) diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td index 09eab50f53a54..0a1521f8ddfb8 100644 --- a/mlir/include/mlir/IR/CommonTypeConstraints.td +++ b/mlir/include/mlir/IR/CommonTypeConstraints.td @@ -198,7 +198,10 @@ class AllOfType allowedTypeList, string summary = "", class ConfinedType predicates, string summary = "", string cppType = type.cppType> : Type< And, - summary, cppType>; + summary, cppType> { + Type baseType = type; + list predicateList = predicates; +} // Integer types. diff --git a/mlir/test/tblgen-to-irdl/CMathDialect.td b/mlir/test/tblgen-to-irdl/CMathDialect.td index 5b9e756727cb3..454543e074c48 100644 --- a/mlir/test/tblgen-to-irdl/CMathDialect.td +++ b/mlir/test/tblgen-to-irdl/CMathDialect.td @@ -25,7 +25,6 @@ def CMath_ComplexType : CMath_Type<"ComplexType", "complex"> { // CHECK: irdl.operation @identity { // CHECK-NEXT: %0 = irdl.base "!cmath.complex" -// CHECK-NEXT: irdl.operands() // CHECK-NEXT: irdl.results(%0) // CHECK-NEXT: } def CMath_IdentityOp : CMath_Op<"identity"> { diff --git a/mlir/test/tblgen-to-irdl/TestDialect.td b/mlir/test/tblgen-to-irdl/TestDialect.td index fc40da527db00..2622c81776076 100644 --- a/mlir/test/tblgen-to-irdl/TestDialect.td +++ b/mlir/test/tblgen-to-irdl/TestDialect.td @@ -28,9 +28,8 @@ def Test_AndOp : Test_Op<"and"> { // CHECK-LABEL: irdl.operation @and { // CHECK-NEXT: %[[v0:[^ ]*]] = irdl.base "!test.singleton_a" // CHECK-NEXT: %[[v1:[^ ]*]] = irdl.any -// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.all_of(%[[v0]], %[[v1]]) +// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.all_of(%[[v0]], %[[v1]]) // CHECK-NEXT: irdl.operands(%[[v2]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } @@ -41,9 +40,39 @@ def Test_AnyOp : Test_Op<"any"> { // CHECK-LABEL: irdl.operation @any { // CHECK-NEXT: %[[v0:[^ ]*]] = irdl.any // CHECK-NEXT: irdl.operands(%[[v0]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } +// Check confined types are converted correctly. +def Test_ConfinedOp : Test_Op<"confined"> { + let arguments = (ins ConfinedType($_self)">]>:$tensor, + ConfinedType($_self)"> + , CPred<"::llvm::cast<::mlir::VectorType>($_self).getRank() > 0">]>]>:$vector); +} +// CHECK-LABEL: irdl.operation @confined { +// CHECK-NEXT: %[[v0:[^ ]*]] = irdl.any +// CHECK-NEXT: %[[v1:[^ ]*]] = irdl.c_pred "(::llvm::isa<::mlir::TensorType>($_self))" +// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.all_of(%[[v0]], %[[v1]]) +// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any +// CHECK-NEXT: %[[v4:[^ ]*]] = irdl.c_pred "(::llvm::isa<::mlir::VectorType>($_self))" +// CHECK-NEXT: %[[v5:[^ ]*]] = irdl.c_pred "(::llvm::cast<::mlir::VectorType>($_self).getRank() > 0)" +// CHECK-NEXT: %[[v6:[^ ]*]] = irdl.all_of(%[[v4]], %[[v5]]) +// CHECK-NEXT: %[[v7:[^ ]*]] = irdl.all_of(%[[v3]], %[[v6]]) +// CHECK-NEXT: irdl.operands(%[[v2]], %[[v7]]) +// CHECK-NEXT: } + +// Check generic integer types are converted correctly. 
+def Test_Integers : Test_Op<"integers"> { + let arguments = (ins AnyI8:$any_int, + AnyInteger:$any_integer); +} +// CHECK-LABEL: irdl.operation @integers { +// CHECK-NEXT: %[[v0:[^ ]*]] = irdl.is i8 +// CHECK-NEXT: %[[v1:[^ ]*]] = irdl.is si8 +// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.is ui8 +// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any_of(%[[v0]], %[[v1]], %[[v2]]) +// CHECK-NEXT: %[[v4:[^ ]*]] = irdl.base "!builtin.integer" +// CHECK-NEXT: irdl.operands(%[[v3]], %[[v4]]) +// CHECK-NEXT: } // Check that AnyTypeOf is converted correctly. def Test_OrOp : Test_Op<"or"> { @@ -53,11 +82,30 @@ def Test_OrOp : Test_Op<"or"> { // CHECK-NEXT: %[[v0:[^ ]*]] = irdl.base "!test.singleton_a" // CHECK-NEXT: %[[v1:[^ ]*]] = irdl.base "!test.singleton_b" // CHECK-NEXT: %[[v2:[^ ]*]] = irdl.base "!test.singleton_c" -// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any_of(%[[v0]], %[[v1]], %[[v2]]) +// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any_of(%[[v0]], %[[v1]], %[[v2]]) // CHECK-NEXT: irdl.operands(%[[v3]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } +// Check that various types are converted correctly. +def Test_TypesOp : Test_Op<"types"> { + let arguments = (ins I32:$a, + SI64:$b, + UI8:$c, + Index:$d, + F32:$e, + NoneType:$f, + Complex); +} +// CHECK-LABEL: irdl.operation @types { +// CHECK-NEXT: %{{.*}} = irdl.is i32 +// CHECK-NEXT: %{{.*}} = irdl.is si64 +// CHECK-NEXT: %{{.*}} = irdl.is ui8 +// CHECK-NEXT: %{{.*}} = irdl.is index +// CHECK-NEXT: %{{.*}} = irdl.is f32 +// CHECK-NEXT: %{{.*}} = irdl.is none +// CHECK-NEXT: %{{.*}} = irdl.is complex +// CHECK-NEXT: irdl.operands({{.*}}) +// CHECK-NEXT: } // Check that variadics and optionals are converted correctly. def Test_VariadicityOp : Test_Op<"variadicity"> { @@ -70,5 +118,4 @@ def Test_VariadicityOp : Test_Op<"variadicity"> { // CHECK-NEXT: %[[v1:[^ ]*]] = irdl.base "!test.singleton_b" // CHECK-NEXT: %[[v2:[^ ]*]] = irdl.base "!test.singleton_c" // CHECK-NEXT: irdl.operands(variadic %[[v0]], optional %[[v1]], %[[v2]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } diff --git a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp index 4a13a00335f65..dd0d98de496e8 100644 --- a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp +++ b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp @@ -39,6 +39,131 @@ llvm::cl::opt selectedDialect("dialect", llvm::cl::desc("The dialect to gen for"), llvm::cl::cat(dialectGenCat), llvm::cl::Required); +Value createPredicate(OpBuilder &builder, tblgen::Pred pred) { + MLIRContext *ctx = builder.getContext(); + + if (pred.isCombined()) { + auto combiner = pred.getDef().getValueAsDef("kind")->getName(); + if (combiner == "PredCombinerAnd" || combiner == "PredCombinerOr") { + std::vector constraints; + for (auto *child : pred.getDef().getValueAsListOfDefs("children")) { + constraints.push_back(createPredicate(builder, tblgen::Pred(child))); + } + if (combiner == "PredCombinerAnd") { + auto op = + builder.create(UnknownLoc::get(ctx), constraints); + return op.getOutput(); + } + auto op = + builder.create(UnknownLoc::get(ctx), constraints); + return op.getOutput(); + } + } + + std::string condition = pred.getCondition(); + // Build a CPredOp to match the C constraint built. 
+ irdl::CPredOp op = builder.create( + UnknownLoc::get(ctx), StringAttr::get(ctx, condition)); + return op; +} + +Value typeToConstraint(OpBuilder &builder, Type type) { + MLIRContext *ctx = builder.getContext(); + auto op = + builder.create(UnknownLoc::get(ctx), TypeAttr::get(type)); + return op.getOutput(); +} + +std::optional recordToType(MLIRContext *ctx, const Record &predRec) { + + if (predRec.isSubClassOf("I")) { + auto width = predRec.getValueAsInt("bitwidth"); + return IntegerType::get(ctx, width, IntegerType::Signless); + } + + if (predRec.isSubClassOf("SI")) { + auto width = predRec.getValueAsInt("bitwidth"); + return IntegerType::get(ctx, width, IntegerType::Signed); + } + + if (predRec.isSubClassOf("UI")) { + auto width = predRec.getValueAsInt("bitwidth"); + return IntegerType::get(ctx, width, IntegerType::Unsigned); + } + + // Index type + if (predRec.getName() == "Index") { + return IndexType::get(ctx); + } + + // Float types + if (predRec.isSubClassOf("F")) { + auto width = predRec.getValueAsInt("bitwidth"); + switch (width) { + case 16: + return FloatType::getF16(ctx); + case 32: + return FloatType::getF32(ctx); + case 64: + return FloatType::getF64(ctx); + case 80: + return FloatType::getF80(ctx); + case 128: + return FloatType::getF128(ctx); + } + } + + if (predRec.getName() == "NoneType") { + return NoneType::get(ctx); + } + + if (predRec.getName() == "BF16") { + return FloatType::getBF16(ctx); + } + + if (predRec.getName() == "TF32") { + return FloatType::getTF32(ctx); + } + + if (predRec.getName() == "F8E4M3FN") { + return FloatType::getFloat8E4M3FN(ctx); + } + + if (predRec.getName() == "F8E5M2") { + return FloatType::getFloat8E5M2(ctx); + } + + if (predRec.getName() == "F8E4M3") { + return FloatType::getFloat8E4M3(ctx); + } + + if (predRec.getName() == "F8E4M3FNUZ") { + return FloatType::getFloat8E4M3FNUZ(ctx); + } + + if (predRec.getName() == "F8E4M3B11FNUZ") { + return FloatType::getFloat8E4M3B11FNUZ(ctx); + } + + if (predRec.getName() == "F8E5M2FNUZ") { + return FloatType::getFloat8E5M2FNUZ(ctx); + } + + if (predRec.getName() == "F8E3M4") { + return FloatType::getFloat8E3M4(ctx); + } + + if (predRec.isSubClassOf("Complex")) { + const Record *elementRec = predRec.getValueAsDef("elementType"); + auto elementType = recordToType(ctx, *elementRec); + if (elementType.has_value()) { + return ComplexType::get(elementType.value()); + } + } + + return std::nullopt; +} + Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { MLIRContext *ctx = builder.getContext(); const Record &predRec = constraint.getDef(); @@ -78,11 +203,45 @@ Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { return op.getOutput(); } - std::string condition = constraint.getPredicate().getCondition(); - // Build a CPredOp to match the C constraint built. 
-  irdl::CPredOp op = builder.create<irdl::CPredOp>(
-      UnknownLoc::get(ctx), StringAttr::get(ctx, condition));
-  return op;
+  // Integer types
+  if (predRec.getName() == "AnyInteger") {
+    auto op = builder.create<irdl::BaseOp>(
+        UnknownLoc::get(ctx), StringAttr::get(ctx, "!builtin.integer"));
+    return op.getOutput();
+  }
+
+  if (predRec.isSubClassOf("AnyI")) {
+    auto width = predRec.getValueAsInt("bitwidth");
+    std::vector<Value> types = {
+        typeToConstraint(builder,
+                         IntegerType::get(ctx, width, IntegerType::Signless)),
+        typeToConstraint(builder,
+                         IntegerType::get(ctx, width, IntegerType::Signed)),
+        typeToConstraint(builder,
+                         IntegerType::get(ctx, width, IntegerType::Unsigned))};
+    auto op = builder.create<irdl::AnyOfOp>(UnknownLoc::get(ctx), types);
+    return op.getOutput();
+  }
+
+  auto type = recordToType(ctx, predRec);
+
+  if (type.has_value()) {
+    return typeToConstraint(builder, type.value());
+  }
+
+  // Confined type
+  if (predRec.isSubClassOf("ConfinedType")) {
+    std::vector<Value> constraints;
+    constraints.push_back(createConstraint(
+        builder, tblgen::Constraint(predRec.getValueAsDef("baseType"))));
+    for (Record *child : predRec.getValueAsListOfDefs("predicateList")) {
+      constraints.push_back(createPredicate(builder, tblgen::Pred(child)));
+    }
+    auto op = builder.create<irdl::AllOfOp>(UnknownLoc::get(ctx), constraints);
+    return op.getOutput();
+  }
+
+  return createPredicate(builder, constraint.getPredicate());
 }

 /// Returns the name of the operation without the dialect prefix.
@@ -131,10 +290,12 @@ irdl::OperationOp createIRDLOperation(OpBuilder &builder,
   auto [results, resultVariadicity] = getValues(tblgenOp.getResults());

   // Create the operands and results operations.
-  consBuilder.create<irdl::OperandsOp>(UnknownLoc::get(ctx), operands,
-                                       operandVariadicity);
-  consBuilder.create<irdl::ResultsOp>(UnknownLoc::get(ctx), results,
-                                      resultVariadicity);
+  if (!operands.empty())
+    consBuilder.create<irdl::OperandsOp>(UnknownLoc::get(ctx), operands,
+                                         operandVariadicity);
+  if (!results.empty())
+    consBuilder.create<irdl::ResultsOp>(UnknownLoc::get(ctx), results,
+                                        resultVariadicity);

   return op;
 }

From 2a130f1a140613445b8f387d3fa54328c1b94cde Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Wed, 11 Sep 2024 14:03:01 +0100
Subject: [PATCH 098/114] [NFC][Clang][SVE] Refactor AArch64SVEACLETypes.def to enable more uses. (#107599)

Some switch statements require all SVE builtin types to be manually
specified. This patch refactors the SVE_*_TYPE macros so that such code
can be generated during preprocessing.

I've tried to establish a minimal interface that covers all types where
no special information is required and then created a set of macros that
are dedicated to specific datatypes (e.g. int, float).

This patch is groundwork for changing the SVE tuple types to become
struct based, as well as for work to support the FP8 ACLE.
---
 .../clang/Basic/AArch64SVEACLETypes.def       | 178 +++++++++++-------
 clang/lib/AST/ASTContext.cpp                  | 162 +++++-----------
 clang/lib/AST/ItaniumMangle.cpp               |  16 +-
 clang/lib/CodeGen/CodeGenTypes.cpp            |  70 ++-----
 4 files changed, 178 insertions(+), 248 deletions(-)

diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index fa9c1ac0491c4..56e6179a664e2 100644
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -8,28 +8,48 @@
 //
 //  This file defines various SVE builtin types.  The macros are:
 //
-//    SVE_TYPE(Name, Id, SingletonId) - A builtin type that has not been
-//    covered by any other #define.  Defining this macro covers all
-//    the builtins.
+//    SVE_TYPE:
+//    - (Name, MangledName, Id, SingletonId)
+//    A builtin type that has not been covered by any other #define. Defining
+//    this macro covers all the builtin types.
 //
-//    SVE_VECTOR_TYPE(Name, Id, SingletonId, ElKind, ElBits, IsSigned, IsFP) -
-//    An SVE scalable vector.
+//    SVE_VECTOR_TYPE, SVE_PREDICATE_TYPE, SVE_OPAQUE_TYPE:
+//    - (Name, MangledName, Id, SingletonId)
+//    A builtin type that has not been covered by any other #define. Defining
+//    this macro covers the named subset of builtin types.
 //
-//    SVE_PREDICATE_TYPE(Name, Id, SingletonId, ElKind) - An SVE scalable
-//    predicate.
+//    SVE_VECTOR_TYPE_INT:
+//    - (Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned)
+//    Defining the macro covers the integer vector types.
+//
+//    SVE_VECTOR_TYPE_FLOAT, SVE_VECTOR_TYPE_BFLOAT:
+//    - (Name, MangledName, Id, SingletonId, NumEls, ElBits, NF)
+//    Defining the macro covers the floating point vector types.
+//
+//    SVE_PREDICATE_TYPE_ALL:
+//    - (Name, MangledName, Id, SingletonId, NumEls, NF)
+//    Defining the macro covers the boolean vector types.
 //
 // where:
 //
 //  - Name is the name of the builtin type.
 //
+//  - MangledName is the mangled name of the builtin type.
+//
 //  - BuiltinType::Id is the enumerator defining the type.
 //
 //  - Context.SingletonId is the global singleton of this type.
 //
 //  - ElKind enumerates the type of the elements.
 //
+//  - NumEls enumerates the number of elements.
+//
 //  - ElBits is the size of one element in bits.
 //
+//  - NF enumerates the number of sub-vectors.
+//    TODO: Tuple types are represented as a concatenation of "NumEls x ElBits"
+//    vectors. This will be changed to become a struct containing NF vectors.
+//
 //  - IsSigned is true for vectors of signed integer elements and
 //    for vectors of floating-point elements.
// @@ -39,102 +59,134 @@ //===----------------------------------------------------------------------===// #ifndef SVE_VECTOR_TYPE -#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits, \ - IsSigned, IsFP, IsBF) \ +#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ SVE_TYPE(Name, Id, SingletonId) #endif +#ifndef SVE_VECTOR_TYPE_DETAILS +#define SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned, IsFP, IsBF) \ + SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) +#endif + +#ifndef SVE_VECTOR_TYPE_BFLOAT +#define SVE_VECTOR_TYPE_BFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ + SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, true) +#endif + +#ifndef SVE_VECTOR_TYPE_FLOAT +#define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ + SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, true, false) +#endif + +#ifndef SVE_VECTOR_TYPE_INT +#define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned) \ + SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned, false, false) +#endif + #ifndef SVE_PREDICATE_TYPE -#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls) \ +#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ SVE_TYPE(Name, Id, SingletonId) #endif +#ifndef SVE_PREDICATE_TYPE_ALL +#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ + SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) +#endif + #ifndef SVE_OPAQUE_TYPE -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \ +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \ SVE_TYPE(Name, Id, SingletonId) #endif //===- Vector point types -----------------------------------------------===// +SVE_VECTOR_TYPE_INT("__SVInt8_t", "__SVInt8_t", SveInt8, SveInt8Ty, 16, 8, 1, true) +SVE_VECTOR_TYPE_INT("__SVInt16_t", "__SVInt16_t", SveInt16, SveInt16Ty, 8, 16, 1, true) +SVE_VECTOR_TYPE_INT("__SVInt32_t", "__SVInt32_t", SveInt32, SveInt32Ty, 4, 32, 1, true) +SVE_VECTOR_TYPE_INT("__SVInt64_t", "__SVInt64_t", SveInt64, SveInt64Ty, 2, 64, 1, true) -SVE_VECTOR_TYPE("__SVInt8_t", "__SVInt8_t", SveInt8, SveInt8Ty, 16, 8, true, false, false) -SVE_VECTOR_TYPE("__SVInt16_t", "__SVInt16_t", SveInt16, SveInt16Ty, 8, 16, true, false, false) -SVE_VECTOR_TYPE("__SVInt32_t", "__SVInt32_t", SveInt32, SveInt32Ty, 4, 32, true, false, false) -SVE_VECTOR_TYPE("__SVInt64_t", "__SVInt64_t", SveInt64, SveInt64Ty, 2, 64, true, false, false) - -SVE_VECTOR_TYPE("__SVUint8_t", "__SVUint8_t", SveUint8, SveUint8Ty, 16, 8, false, false, false) -SVE_VECTOR_TYPE("__SVUint16_t", "__SVUint16_t", SveUint16, SveUint16Ty, 8, 16, false, false, false) -SVE_VECTOR_TYPE("__SVUint32_t", "__SVUint32_t", SveUint32, SveUint32Ty, 4, 32, false, false, false) -SVE_VECTOR_TYPE("__SVUint64_t", "__SVUint64_t", SveUint64, SveUint64Ty, 2, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__SVUint8_t", "__SVUint8_t", SveUint8, SveUint8Ty, 16, 8, 1, false) +SVE_VECTOR_TYPE_INT("__SVUint16_t", "__SVUint16_t", SveUint16, SveUint16Ty, 8, 16, 1, false) +SVE_VECTOR_TYPE_INT("__SVUint32_t", "__SVUint32_t", SveUint32, SveUint32Ty, 4, 32, 1, false) +SVE_VECTOR_TYPE_INT("__SVUint64_t", "__SVUint64_t", SveUint64, SveUint64Ty, 2, 64, 1, false) -SVE_VECTOR_TYPE("__SVFloat16_t", "__SVFloat16_t", SveFloat16, SveFloat16Ty, 8, 16, true, true, false) -SVE_VECTOR_TYPE("__SVFloat32_t", 
"__SVFloat32_t", SveFloat32, SveFloat32Ty, 4, 32, true, true, false) -SVE_VECTOR_TYPE("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty, 2, 64, true, true, false) +SVE_VECTOR_TYPE_FLOAT("__SVFloat16_t", "__SVFloat16_t", SveFloat16, SveFloat16Ty, 8, 16, 1) +SVE_VECTOR_TYPE_FLOAT("__SVFloat32_t", "__SVFloat32_t", SveFloat32, SveFloat32Ty, 4, 32, 1) +SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty, 2, 64, 1) -SVE_VECTOR_TYPE("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, true, false, true) +SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1) // // x2 // -SVE_VECTOR_TYPE("__clang_svint8x2_t", "svint8x2_t", SveInt8x2, SveInt8x2Ty, 32, 8, true, false, false) -SVE_VECTOR_TYPE("__clang_svint16x2_t", "svint16x2_t", SveInt16x2, SveInt16x2Ty, 16, 16, true, false, false) -SVE_VECTOR_TYPE("__clang_svint32x2_t", "svint32x2_t", SveInt32x2, SveInt32x2Ty, 8, 32, true, false, false) -SVE_VECTOR_TYPE("__clang_svint64x2_t", "svint64x2_t", SveInt64x2, SveInt64x2Ty, 4, 64, true, false, false) -SVE_VECTOR_TYPE("__clang_svuint8x2_t", "svuint8x2_t", SveUint8x2, SveUint8x2Ty, 32, 8, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint16x2_t", "svuint16x2_t", SveUint16x2, SveUint16x2Ty, 16, 16, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint32x2_t", "svuint32x2_t", SveUint32x2, SveUint32x2Ty, 8, 32, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint64x2_t", "svuint64x2_t", SveUint64x2, SveUint64x2Ty, 4, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__clang_svint8x2_t", "svint8x2_t", SveInt8x2, SveInt8x2Ty, 16, 8, 2, true) +SVE_VECTOR_TYPE_INT("__clang_svint16x2_t", "svint16x2_t", SveInt16x2, SveInt16x2Ty, 8, 16, 2, true) +SVE_VECTOR_TYPE_INT("__clang_svint32x2_t", "svint32x2_t", SveInt32x2, SveInt32x2Ty, 4, 32, 2, true) +SVE_VECTOR_TYPE_INT("__clang_svint64x2_t", "svint64x2_t", SveInt64x2, SveInt64x2Ty, 2, 64, 2, true) -SVE_VECTOR_TYPE("__clang_svfloat16x2_t", "svfloat16x2_t", SveFloat16x2, SveFloat16x2Ty, 16, 16, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat32x2_t", "svfloat32x2_t", SveFloat32x2, SveFloat32x2Ty, 8, 32, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, SveFloat64x2Ty, 4, 64, true, true, false) +SVE_VECTOR_TYPE_INT("__clang_svuint8x2_t", "svuint8x2_t", SveUint8x2, SveUint8x2Ty, 16 , 8, 2, false) +SVE_VECTOR_TYPE_INT("__clang_svuint16x2_t", "svuint16x2_t", SveUint16x2, SveUint16x2Ty, 8, 16, 2, false) +SVE_VECTOR_TYPE_INT("__clang_svuint32x2_t", "svuint32x2_t", SveUint32x2, SveUint32x2Ty, 4, 32, 2, false) +SVE_VECTOR_TYPE_INT("__clang_svuint64x2_t", "svuint64x2_t", SveUint64x2, SveUint64x2Ty, 2, 64, 2, false) + +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x2_t", "svfloat16x2_t", SveFloat16x2, SveFloat16x2Ty, 8, 16, 2) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x2_t", "svfloat32x2_t", SveFloat32x2, SveFloat32x2Ty, 4, 32, 2) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, SveFloat64x2Ty, 2, 64, 2) + +SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2) -SVE_VECTOR_TYPE("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 16, 16, true, false, true) // // x3 // -SVE_VECTOR_TYPE("__clang_svint8x3_t", "svint8x3_t", SveInt8x3, SveInt8x3Ty, 48, 8, true, false, false) -SVE_VECTOR_TYPE("__clang_svint16x3_t", "svint16x3_t", SveInt16x3, SveInt16x3Ty, 24, 16, true, false, false) -SVE_VECTOR_TYPE("__clang_svint32x3_t", "svint32x3_t", SveInt32x3, 
SveInt32x3Ty, 12, 32, true, false, false) -SVE_VECTOR_TYPE("__clang_svint64x3_t", "svint64x3_t", SveInt64x3, SveInt64x3Ty, 6, 64, true, false, false) -SVE_VECTOR_TYPE("__clang_svuint8x3_t", "svuint8x3_t", SveUint8x3, SveUint8x3Ty, 48, 8, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint16x3_t", "svuint16x3_t", SveUint16x3, SveUint16x3Ty, 24, 16, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint32x3_t", "svuint32x3_t", SveUint32x3, SveUint32x3Ty, 12, 32, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint64x3_t", "svuint64x3_t", SveUint64x3, SveUint64x3Ty, 6, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__clang_svint8x3_t", "svint8x3_t", SveInt8x3, SveInt8x3Ty, 16, 8, 3, true) +SVE_VECTOR_TYPE_INT("__clang_svint16x3_t", "svint16x3_t", SveInt16x3, SveInt16x3Ty, 8, 16, 3, true) +SVE_VECTOR_TYPE_INT("__clang_svint32x3_t", "svint32x3_t", SveInt32x3, SveInt32x3Ty, 4, 32, 3, true) +SVE_VECTOR_TYPE_INT("__clang_svint64x3_t", "svint64x3_t", SveInt64x3, SveInt64x3Ty, 2, 64, 3, true) + +SVE_VECTOR_TYPE_INT("__clang_svuint8x3_t", "svuint8x3_t", SveUint8x3, SveUint8x3Ty, 16, 8, 3, false) +SVE_VECTOR_TYPE_INT("__clang_svuint16x3_t", "svuint16x3_t", SveUint16x3, SveUint16x3Ty, 8, 16, 3, false) +SVE_VECTOR_TYPE_INT("__clang_svuint32x3_t", "svuint32x3_t", SveUint32x3, SveUint32x3Ty, 4, 32, 3, false) +SVE_VECTOR_TYPE_INT("__clang_svuint64x3_t", "svuint64x3_t", SveUint64x3, SveUint64x3Ty, 2, 64, 3, false) -SVE_VECTOR_TYPE("__clang_svfloat16x3_t", "svfloat16x3_t", SveFloat16x3, SveFloat16x3Ty, 24, 16, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat32x3_t", "svfloat32x3_t", SveFloat32x3, SveFloat32x3Ty, 12, 32, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, SveFloat64x3Ty, 6, 64, true, true, false) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x3_t", "svfloat16x3_t", SveFloat16x3, SveFloat16x3Ty, 8, 16, 3) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x3_t", "svfloat32x3_t", SveFloat32x3, SveFloat32x3Ty, 4, 32, 3) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, SveFloat64x3Ty, 2, 64, 3) + +SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3) -SVE_VECTOR_TYPE("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 24, 16, true, false, true) // // x4 // -SVE_VECTOR_TYPE("__clang_svint8x4_t", "svint8x4_t", SveInt8x4, SveInt8x4Ty, 64, 8, true, false, false) -SVE_VECTOR_TYPE("__clang_svint16x4_t", "svint16x4_t", SveInt16x4, SveInt16x4Ty, 32, 16, true, false, false) -SVE_VECTOR_TYPE("__clang_svint32x4_t", "svint32x4_t", SveInt32x4, SveInt32x4Ty, 16, 32, true, false, false) -SVE_VECTOR_TYPE("__clang_svint64x4_t", "svint64x4_t", SveInt64x4, SveInt64x4Ty, 8, 64, true, false, false) -SVE_VECTOR_TYPE("__clang_svuint8x4_t", "svuint8x4_t", SveUint8x4, SveUint8x4Ty, 64, 8, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint16x4_t", "svuint16x4_t", SveUint16x4, SveUint16x4Ty, 32, 16, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint32x4_t", "svuint32x4_t", SveUint32x4, SveUint32x4Ty, 16, 32, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint64x4_t", "svuint64x4_t", SveUint64x4, SveUint64x4Ty, 8, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__clang_svint8x4_t", "svint8x4_t", SveInt8x4, SveInt8x4Ty, 16, 8, 4, true) +SVE_VECTOR_TYPE_INT("__clang_svint16x4_t", "svint16x4_t", SveInt16x4, SveInt16x4Ty, 8, 16, 4, true) +SVE_VECTOR_TYPE_INT("__clang_svint32x4_t", "svint32x4_t", SveInt32x4, SveInt32x4Ty, 4, 32, 4, true) +SVE_VECTOR_TYPE_INT("__clang_svint64x4_t", 
"svint64x4_t", SveInt64x4, SveInt64x4Ty, 2, 64, 4, true) + +SVE_VECTOR_TYPE_INT("__clang_svuint8x4_t", "svuint8x4_t", SveUint8x4, SveUint8x4Ty, 16, 8, 4, false) +SVE_VECTOR_TYPE_INT("__clang_svuint16x4_t", "svuint16x4_t", SveUint16x4, SveUint16x4Ty, 8, 16, 4, false) +SVE_VECTOR_TYPE_INT("__clang_svuint32x4_t", "svuint32x4_t", SveUint32x4, SveUint32x4Ty, 4, 32, 4, false) +SVE_VECTOR_TYPE_INT("__clang_svuint64x4_t", "svuint64x4_t", SveUint64x4, SveUint64x4Ty, 2, 64, 4, false) -SVE_VECTOR_TYPE("__clang_svfloat16x4_t", "svfloat16x4_t", SveFloat16x4, SveFloat16x4Ty, 32, 16, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat32x4_t", "svfloat32x4_t", SveFloat32x4, SveFloat32x4Ty, 16, 32, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, SveFloat64x4Ty, 8, 64, true, true, false) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x4_t", "svfloat16x4_t", SveFloat16x4, SveFloat16x4Ty, 8, 16, 4) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x4_t", "svfloat32x4_t", SveFloat32x4, SveFloat32x4Ty, 4, 32, 4) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, SveFloat64x4Ty, 2, 64, 4) -SVE_VECTOR_TYPE("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 32, 16, true, false, true) +SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4) -SVE_PREDICATE_TYPE("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16) -SVE_PREDICATE_TYPE("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 32) -SVE_PREDICATE_TYPE("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4Ty, 64) +SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1) +SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2) +SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4Ty, 16, 4) SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy) #undef SVE_VECTOR_TYPE +#undef SVE_VECTOR_TYPE_BFLOAT +#undef SVE_VECTOR_TYPE_FLOAT +#undef SVE_VECTOR_TYPE_INT #undef SVE_PREDICATE_TYPE +#undef SVE_PREDICATE_TYPE_ALL #undef SVE_OPAQUE_TYPE #undef SVE_TYPE diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index fa9cc38efc466..8ece39a383046 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2203,13 +2203,12 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { // Because the length is only known at runtime, we use a dummy value // of 0 for the static length. The alignment values are those defined // by the Procedure Call Standard for the Arm Architecture. 
-#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits, \ - IsSigned, IsFP, IsBF) \ +#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ Width = 0; \ Align = 128; \ break; -#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls) \ +#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ Width = 0; \ Align = 16; \ @@ -4284,108 +4283,27 @@ ASTContext::getBuiltinVectorTypeInfo(const BuiltinType *Ty) const { switch (Ty->getKind()) { default: llvm_unreachable("Unsupported builtin vector type"); - case BuiltinType::SveInt8: - return SVE_INT_ELTTY(8, 16, true, 1); - case BuiltinType::SveUint8: - return SVE_INT_ELTTY(8, 16, false, 1); - case BuiltinType::SveInt8x2: - return SVE_INT_ELTTY(8, 16, true, 2); - case BuiltinType::SveUint8x2: - return SVE_INT_ELTTY(8, 16, false, 2); - case BuiltinType::SveInt8x3: - return SVE_INT_ELTTY(8, 16, true, 3); - case BuiltinType::SveUint8x3: - return SVE_INT_ELTTY(8, 16, false, 3); - case BuiltinType::SveInt8x4: - return SVE_INT_ELTTY(8, 16, true, 4); - case BuiltinType::SveUint8x4: - return SVE_INT_ELTTY(8, 16, false, 4); - case BuiltinType::SveInt16: - return SVE_INT_ELTTY(16, 8, true, 1); - case BuiltinType::SveUint16: - return SVE_INT_ELTTY(16, 8, false, 1); - case BuiltinType::SveInt16x2: - return SVE_INT_ELTTY(16, 8, true, 2); - case BuiltinType::SveUint16x2: - return SVE_INT_ELTTY(16, 8, false, 2); - case BuiltinType::SveInt16x3: - return SVE_INT_ELTTY(16, 8, true, 3); - case BuiltinType::SveUint16x3: - return SVE_INT_ELTTY(16, 8, false, 3); - case BuiltinType::SveInt16x4: - return SVE_INT_ELTTY(16, 8, true, 4); - case BuiltinType::SveUint16x4: - return SVE_INT_ELTTY(16, 8, false, 4); - case BuiltinType::SveInt32: - return SVE_INT_ELTTY(32, 4, true, 1); - case BuiltinType::SveUint32: - return SVE_INT_ELTTY(32, 4, false, 1); - case BuiltinType::SveInt32x2: - return SVE_INT_ELTTY(32, 4, true, 2); - case BuiltinType::SveUint32x2: - return SVE_INT_ELTTY(32, 4, false, 2); - case BuiltinType::SveInt32x3: - return SVE_INT_ELTTY(32, 4, true, 3); - case BuiltinType::SveUint32x3: - return SVE_INT_ELTTY(32, 4, false, 3); - case BuiltinType::SveInt32x4: - return SVE_INT_ELTTY(32, 4, true, 4); - case BuiltinType::SveUint32x4: - return SVE_INT_ELTTY(32, 4, false, 4); - case BuiltinType::SveInt64: - return SVE_INT_ELTTY(64, 2, true, 1); - case BuiltinType::SveUint64: - return SVE_INT_ELTTY(64, 2, false, 1); - case BuiltinType::SveInt64x2: - return SVE_INT_ELTTY(64, 2, true, 2); - case BuiltinType::SveUint64x2: - return SVE_INT_ELTTY(64, 2, false, 2); - case BuiltinType::SveInt64x3: - return SVE_INT_ELTTY(64, 2, true, 3); - case BuiltinType::SveUint64x3: - return SVE_INT_ELTTY(64, 2, false, 3); - case BuiltinType::SveInt64x4: - return SVE_INT_ELTTY(64, 2, true, 4); - case BuiltinType::SveUint64x4: - return SVE_INT_ELTTY(64, 2, false, 4); - case BuiltinType::SveBool: - return SVE_ELTTY(BoolTy, 16, 1); - case BuiltinType::SveBoolx2: - return SVE_ELTTY(BoolTy, 16, 2); - case BuiltinType::SveBoolx4: - return SVE_ELTTY(BoolTy, 16, 4); - case BuiltinType::SveFloat16: - return SVE_ELTTY(HalfTy, 8, 1); - case BuiltinType::SveFloat16x2: - return SVE_ELTTY(HalfTy, 8, 2); - case BuiltinType::SveFloat16x3: - return SVE_ELTTY(HalfTy, 8, 3); - case BuiltinType::SveFloat16x4: - return SVE_ELTTY(HalfTy, 8, 4); - case BuiltinType::SveFloat32: - return SVE_ELTTY(FloatTy, 4, 1); - case BuiltinType::SveFloat32x2: - return SVE_ELTTY(FloatTy, 4, 2); - case BuiltinType::SveFloat32x3: - return 
SVE_ELTTY(FloatTy, 4, 3); - case BuiltinType::SveFloat32x4: - return SVE_ELTTY(FloatTy, 4, 4); - case BuiltinType::SveFloat64: - return SVE_ELTTY(DoubleTy, 2, 1); - case BuiltinType::SveFloat64x2: - return SVE_ELTTY(DoubleTy, 2, 2); - case BuiltinType::SveFloat64x3: - return SVE_ELTTY(DoubleTy, 2, 3); - case BuiltinType::SveFloat64x4: - return SVE_ELTTY(DoubleTy, 2, 4); - case BuiltinType::SveBFloat16: - return SVE_ELTTY(BFloat16Ty, 8, 1); - case BuiltinType::SveBFloat16x2: - return SVE_ELTTY(BFloat16Ty, 8, 2); - case BuiltinType::SveBFloat16x3: - return SVE_ELTTY(BFloat16Ty, 8, 3); - case BuiltinType::SveBFloat16x4: - return SVE_ELTTY(BFloat16Ty, 8, 4); + +#define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF, IsSigned) \ + case BuiltinType::Id: \ + return {getIntTypeForBitwidth(ElBits, IsSigned), \ + llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + case BuiltinType::Id: \ + return {ElBits == 16 ? HalfTy : (ElBits == 32 ? FloatTy : DoubleTy), \ + llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_VECTOR_TYPE_BFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + case BuiltinType::Id: \ + return {BFloat16Ty, llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ + case BuiltinType::Id: \ + return {BoolTy, llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) +#include "clang/Basic/AArch64SVEACLETypes.def" + #define RVV_VECTOR_TYPE_INT(Name, Id, SingletonId, NumEls, ElBits, NF, \ IsSigned) \ case BuiltinType::Id: \ @@ -4425,22 +4343,30 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts, unsigned NumFields) const { if (Target->hasAArch64SVETypes()) { uint64_t EltTySize = getTypeSize(EltTy); -#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits, \ - IsSigned, IsFP, IsBF) \ - if (!EltTy->isBooleanType() && \ - ((EltTy->hasIntegerRepresentation() && \ - EltTy->hasSignedIntegerRepresentation() == IsSigned) || \ - (EltTy->hasFloatingRepresentation() && !EltTy->isBFloat16Type() && \ - IsFP && !IsBF) || \ - (EltTy->hasFloatingRepresentation() && EltTy->isBFloat16Type() && \ - IsBF && !IsFP)) && \ - EltTySize == ElBits && NumElts == NumEls) { \ + +#define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF, IsSigned) \ + if (EltTy->hasIntegerRepresentation() && !EltTy->isBooleanType() && \ + EltTy->hasSignedIntegerRepresentation() == IsSigned && \ + EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ return SingletonId; \ } -#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls) \ - if (EltTy->isBooleanType() && NumElts == NumEls) \ +#define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + if (EltTy->hasFloatingRepresentation() && !EltTy->isBFloat16Type() && \ + EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ + return SingletonId; \ + } +#define SVE_VECTOR_TYPE_BFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + if (EltTy->hasFloatingRepresentation() && EltTy->isBFloat16Type() && \ + EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ + return SingletonId; \ + } +#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ + if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1) \ 
return SingletonId; -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingleTonId) +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" } else if (Target->hasRISCVVTypes()) { uint64_t EltTySize = getTypeSize(EltTy); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 7d638befcbd3f..b6e1da0c3192d 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3384,8 +3384,7 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { // The SVE types are effectively target-specific. The mangling scheme // is defined in the appendices to the Procedure Call Standard for the // Arm Architecture. -#define SVE_VECTOR_TYPE(InternalName, MangledName, Id, SingletonId, NumEls, \ - ElBits, IsSigned, IsFP, IsBF) \ +#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ if (T->getKind() == BuiltinType::SveBFloat16 && \ isCompatibleWith(LangOptions::ClangABI::Ver17)) { \ @@ -3394,21 +3393,18 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { Out << "u" << type_name.size() << type_name; \ } else { \ type_name = MangledName; \ - Out << (type_name == InternalName ? "u" : "") << type_name.size() \ - << type_name; \ + Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ } \ break; -#define SVE_PREDICATE_TYPE(InternalName, MangledName, Id, SingletonId, NumEls) \ +#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ type_name = MangledName; \ - Out << (type_name == InternalName ? "u" : "") << type_name.size() \ - << type_name; \ + Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ break; -#define SVE_OPAQUE_TYPE(InternalName, MangledName, Id, SingletonId) \ +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ type_name = MangledName; \ - Out << (type_name == InternalName ? "u" : "") << type_name.size() \ - << type_name; \ + Out << (type_name == Name ? 
"u" : "") << type_name.size() << type_name; \ break; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 11a577bbdd078..5eebd8ad2a065 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -500,63 +500,19 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { case BuiltinType::OCLReserveID: ResultType = CGM.getOpenCLRuntime().convertOpenCLSpecificType(Ty); break; - case BuiltinType::SveInt8: - case BuiltinType::SveUint8: - case BuiltinType::SveInt8x2: - case BuiltinType::SveUint8x2: - case BuiltinType::SveInt8x3: - case BuiltinType::SveUint8x3: - case BuiltinType::SveInt8x4: - case BuiltinType::SveUint8x4: - case BuiltinType::SveInt16: - case BuiltinType::SveUint16: - case BuiltinType::SveInt16x2: - case BuiltinType::SveUint16x2: - case BuiltinType::SveInt16x3: - case BuiltinType::SveUint16x3: - case BuiltinType::SveInt16x4: - case BuiltinType::SveUint16x4: - case BuiltinType::SveInt32: - case BuiltinType::SveUint32: - case BuiltinType::SveInt32x2: - case BuiltinType::SveUint32x2: - case BuiltinType::SveInt32x3: - case BuiltinType::SveUint32x3: - case BuiltinType::SveInt32x4: - case BuiltinType::SveUint32x4: - case BuiltinType::SveInt64: - case BuiltinType::SveUint64: - case BuiltinType::SveInt64x2: - case BuiltinType::SveUint64x2: - case BuiltinType::SveInt64x3: - case BuiltinType::SveUint64x3: - case BuiltinType::SveInt64x4: - case BuiltinType::SveUint64x4: - case BuiltinType::SveBool: - case BuiltinType::SveBoolx2: - case BuiltinType::SveBoolx4: - case BuiltinType::SveFloat16: - case BuiltinType::SveFloat16x2: - case BuiltinType::SveFloat16x3: - case BuiltinType::SveFloat16x4: - case BuiltinType::SveFloat32: - case BuiltinType::SveFloat32x2: - case BuiltinType::SveFloat32x3: - case BuiltinType::SveFloat32x4: - case BuiltinType::SveFloat64: - case BuiltinType::SveFloat64x2: - case BuiltinType::SveFloat64x3: - case BuiltinType::SveFloat64x4: - case BuiltinType::SveBFloat16: - case BuiltinType::SveBFloat16x2: - case BuiltinType::SveBFloat16x3: - case BuiltinType::SveBFloat16x4: { - ASTContext::BuiltinVectorTypeInfo Info = - Context.getBuiltinVectorTypeInfo(cast(Ty)); - return llvm::ScalableVectorType::get(ConvertType(Info.ElementType), - Info.EC.getKnownMinValue() * - Info.NumVectors); - } +#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ + case BuiltinType::Id: +#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ + case BuiltinType::Id: +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) +#include "clang/Basic/AArch64SVEACLETypes.def" + { + ASTContext::BuiltinVectorTypeInfo Info = + Context.getBuiltinVectorTypeInfo(cast(Ty)); + return llvm::ScalableVectorType::get(ConvertType(Info.ElementType), + Info.EC.getKnownMinValue() * + Info.NumVectors); + } case BuiltinType::SveCount: return llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); #define PPC_VECTOR_TYPE(Name, Id, Size) \ From 44fc987ed174e32544a577387ab0df6886495e82 Mon Sep 17 00:00:00 2001 From: Vladislav Dzhidzhoev Date: Wed, 11 Sep 2024 16:04:01 +0300 Subject: [PATCH 099/114] [lldb][test] Toolchain detection rewrite in Python (#102185) This fix is based on a problem with cxx_compiler and cxx_linker macros on Windows. There was an issue with compiler detection in paths containing "icc". In such case, Makefile.rules thought it was provided with icc compiler. To solve that, utilities detection has been rewritten in Python. 
The last element of the compiler's path is split off, taking the
platform's path delimiter into account, and the compiler type is
extracted from it, allowing for a possible cross-toolchain prefix.

---------

Co-authored-by: Pavel Labath
---
 .../Python/lldbsuite/test/builders/builder.py | 98 +++++++++++++++++--
 .../Python/lldbsuite/test/make/Makefile.rules | 92 ++++-------------
 .../breakpoint/breakpoint_ids/Makefile        |  2 +-
 .../breakpoint/breakpoint_locations/Makefile  |  2 +-
 .../consecutive_breakpoints/Makefile          |  2 +-
 .../functionalities/breakpoint/cpp/Makefile   |  2 +-
 .../dummy_target_breakpoints/Makefile         |  2 +-
 .../require_hw_breakpoints/Makefile           |  2 +-
 .../breakpoint/step_over_breakpoint/Makefile  |  2 +-
 .../thread_plan_user_breakpoint/Makefile      |  2 +-
 .../ObjCDataFormatterTestCase.py              |  4 +-
 .../TestNSDictionarySynthetic.py              |  4 +-
 .../nssetsynth/TestNSSetSynthetic.py          |  4 +-
 .../poarray/TestPrintObjectArray.py           |  4 +-
 .../functionalities/inline-stepping/Makefile  |  2 +-
 .../postmortem/minidump-new/makefile.txt      |  1 +
 .../lang/objc/orderedset/TestOrderedSet.py    |  4 +-
 .../TestObjCSingleEntryDictionary.py          |  4 +-
 lldb/test/API/macosx/macCatalyst/Makefile     |  1 +
 .../macCatalystAppMacOSFramework/Makefile     |  1 +
 .../macosx/simulator/TestSimulatorPlatform.py |  4 +-
 .../API/python_api/frame/inlines/Makefile     |  2 +-
 .../lldb-server/TestAppleSimulatorOSType.py   |  4 +-
 23 files changed, 139 insertions(+), 106 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py
index 4ea9a86c1d5fc..564918c58b6dd 100644
--- a/lldb/packages/Python/lldbsuite/test/builders/builder.py
+++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py
@@ -1,10 +1,12 @@
 import os
+import pathlib
 import platform
 import subprocess
 import sys
 import itertools

 import lldbsuite.test.lldbtest as lldbtest
+import lldbsuite.test.lldbplatformutil as lldbplatformutil
 import lldbsuite.test.lldbutil as lldbutil
 from lldbsuite.test import configuration
 from lldbsuite.test_event import build_exception
@@ -96,17 +98,101 @@ def getArchSpec(self, architecture):
         """
         return ["ARCH=" + architecture] if architecture else []

-    def getCCSpec(self, compiler):
+    def getToolchainSpec(self, compiler):
         """
-        Helper function to return the key-value string to specify the compiler
+        Helper function to return the key-value strings to specify the toolchain
         used for the make system.
         """
         cc = compiler if compiler else None
         if not cc and configuration.compiler:
             cc = configuration.compiler
-        if cc:
-            return ['CC="%s"' % cc]
-        return []
+
+        if not cc:
+            return []
+
+        cc = cc.strip()
+        cc_path = pathlib.Path(cc)
+
+        # We can get CC compiler string in the following formats:
+        #  [<tool>] <compiler> - such as 'xcrun clang', 'xcrun /usr/bin/clang' etc.
+        #
+        # Where <compiler> could contain the following parts:
+        #   <simple-name>[.<extension>] - such as 'clang', 'clang.exe' ('clang-cl.exe'?)
+        #   <triple-prefix>-<simple-name>[.<extension>] - such as 'armv7-linux-gnueabi-gcc'
+        #   <path>/<simple-name>[.<extension>] - such as '/usr/bin/clang', 'c:\path\to\compiler\clang.exe'
+        #   <path>/<triple-prefix>-<simple-name>[.<extension>] - such as '/usr/bin/clang', 'c:\path\to\compiler\clang.exe'
+
+        cc_ext = cc_path.suffix
+        # Compiler name without extension
+        cc_name = cc_path.stem.split(" ")[-1]
+
+        # A kind of compiler (canonical name): clang, gcc, cc etc.
+        cc_type = cc_name
+        # A triple prefix of the compiler name: <triple-prefix>-gcc
+        cc_prefix = ""
+        if not "clang-cl" in cc_name and not "llvm-gcc" in cc_name:
+            cc_name_parts = cc_name.split("-")
+            cc_type = cc_name_parts[-1]
+            if len(cc_name_parts) > 1:
+                cc_prefix = "-".join(cc_name_parts[:-1]) + "-"
+
+        # A kind of C++ compiler.
+        cxx_types = {
+            "icc": "icpc",
+            "llvm-gcc": "llvm-g++",
+            "gcc": "g++",
+            "cc": "c++",
+            "clang": "clang++",
+        }
+        cxx_type = cxx_types.get(cc_type, cc_type)
+
+        cc_dir = cc_path.parent
+
+        def getToolchainUtil(util_name):
+            return cc_dir / (cc_prefix + util_name + cc_ext)
+
+        cxx = getToolchainUtil(cxx_type)
+
+        util_names = {
+            "OBJCOPY": "objcopy",
+            "STRIP": "strip",
+            "ARCHIVER": "ar",
+            "DWP": "dwp",
+        }
+        utils = []
+
+        if not lldbplatformutil.platformIsDarwin():
+            if cc_type in ["clang", "cc", "gcc"]:
+                util_paths = {}
+                # Assemble a toolchain-side tool command based on the passed CC.
+                for var, name in util_names.items():
+                    # Do not override an explicitly specified tool from the command line.
+                    if not os.getenv(var):
+                        util_paths[var] = getToolchainUtil(name)
+                    else:
+                        util_paths[var] = os.getenv(var)
+                utils.extend(["AR=%s" % util_paths["ARCHIVER"]])
+
+                # Look for llvm-dwp or GNU dwp.
+                if not lldbutil.which(util_paths["DWP"]):
+                    util_paths["DWP"] = getToolchainUtil("llvm-dwp")
+                if not lldbutil.which(util_paths["DWP"]):
+                    util_paths["DWP"] = lldbutil.which("llvm-dwp")
+                if not util_paths["DWP"]:
+                    util_paths["DWP"] = lldbutil.which("dwp")
+                if not util_paths["DWP"]:
+                    del util_paths["DWP"]
+
+                for var, path in util_paths.items():
+                    utils.append("%s=%s" % (var, path))
+            else:
+                utils.extend(["AR=%slibtool" % os.getenv("CROSS_COMPILE", "")])
+
+        return [
+            "CC=%s" % cc,
+            "CC_TYPE=%s" % cc_type,
+            "CXX=%s" % cxx,
+        ] + utils

     def getSDKRootSpec(self):
         """
@@ -178,7 +264,7 @@ def getBuildCommand(
             make_targets,
             self.getArchCFlags(architecture),
             self.getArchSpec(architecture),
-            self.getCCSpec(compiler),
+            self.getToolchainSpec(compiler),
             self.getExtraMakeArgs(),
             self.getSDKRootSpec(),
             self.getModuleCacheSpec(),
diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
index 1ba3f843e87cf..f81db9bc06d8a 100644
--- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
+++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
@@ -102,15 +102,22 @@ endif
 # If you change the defaults of CC, be sure to also change it in the file
 # test/builders/builder_base.py, which provides a Python way to return the
 # value of the make variable CC -- getCompiler().
-#
-# See also these functions:
-#   o cxx_compiler
-#   o cxx_linker
 #----------------------------------------------------------------------
 ifeq "$(CC)" ""
 $(error "C compiler is not specified. Please run tests through lldb-dotest or lit")
 endif

+# Always override the linker. Assign the already-normalized CC.
+override LD := $(CC)
+# The kind of linker. It is always derived from CC.
+override LDC := $(CC_TYPE)
+
+ifeq "$(HOST_OS)" "Windows_NT"
+  # This encloses the full path in platform-specific quotes. This is necessary to run the c++ executable
+  # properly under 'sh' on a Windows host (preventing path breakage due to Windows-style path separators).
+  override CXX := $(QUOTE)$(CXX)$(QUOTE)
+endif
+
 #----------------------------------------------------------------------
 # Handle SDKROOT for the cross platform builds.
#---------------------------------------------------------------------- @@ -147,10 +154,8 @@ ifeq "$(OS)" "Darwin" DS := $(DSYMUTIL) DSFLAGS := $(DSFLAGS_EXTRAS) DSYM = $(EXE).dSYM - AR := $(CROSS_COMPILE)libtool ARFLAGS := -static -o else - AR := $(CROSS_COMPILE)ar # On non-Apple platforms, -arch becomes -m ARCHFLAG := -m @@ -213,7 +218,7 @@ endif LIMIT_DEBUG_INFO_FLAGS = NO_LIMIT_DEBUG_INFO_FLAGS = MODULE_DEBUG_INFO_FLAGS = -ifneq (,$(findstring clang,$(CC))) +ifeq ($(CC_TYPE), clang) LIMIT_DEBUG_INFO_FLAGS += -flimit-debug-info NO_LIMIT_DEBUG_INFO_FLAGS += -fno-limit-debug-info MODULE_DEBUG_INFO_FLAGS += -gmodules @@ -279,7 +284,6 @@ endif CFLAGS += $(CFLAGS_EXTRAS) CXXFLAGS += -std=c++11 $(CFLAGS) $(ARCH_CXXFLAGS) -LD = $(CC) # Copy common options to the linker flags (dwarf, arch. & etc). # Note: we get some 'garbage' options for linker here (such as -I, --isystem & etc). LDFLAGS += $(CFLAGS) @@ -312,61 +316,6 @@ ifneq "$(DYLIB_NAME)" "" endif endif -# Function that returns the counterpart C++ compiler, given $(CC) as arg. -cxx_compiler_notdir = $(if $(findstring icc,$(1)), \ - $(subst icc,icpc,$(1)), \ - $(if $(findstring llvm-gcc,$(1)), \ - $(subst llvm-gcc,llvm-g++,$(1)), \ - $(if $(findstring gcc,$(1)), \ - $(subst gcc,g++,$(1)), \ - $(subst cc,c++,$(1))))) -cxx_compiler = $(if $(findstring /,$(1)),$(join $(dir $(1)), $(call cxx_compiler_notdir,$(notdir $(1)))),$(call cxx_compiler_notdir,$(1))) - -# Function that returns the C++ linker, given $(CC) as arg. -cxx_linker_notdir = $(if $(findstring icc,$(1)), \ - $(subst icc,icpc,$(1)), \ - $(if $(findstring llvm-gcc,$(1)), \ - $(subst llvm-gcc,llvm-g++,$(1)), \ - $(if $(findstring gcc,$(1)), \ - $(subst gcc,g++,$(1)), \ - $(subst cc,c++,$(1))))) -cxx_linker = $(if $(findstring /,$(1)),$(join $(dir $(1)), $(call cxx_linker_notdir,$(notdir $(1)))),$(call cxx_linker_notdir,$(1))) - -ifneq "$(OS)" "Darwin" - CLANG_OR_GCC := $(strip $(if $(findstring clang,$(CC)), \ - $(findstring clang,$(CC)), \ - $(if $(findstring gcc,$(CC)), \ - $(findstring gcc,$(CC)), \ - cc))) - - CC_LASTWORD := $(strip $(lastword $(subst -, ,$(CC)))) - - replace_with = $(strip $(if $(findstring $(3),$(CC_LASTWORD)), \ - $(subst $(3),$(1),$(2)), \ - $(subst $(3),$(1),$(subst -$(CC_LASTWORD),,$(2))))) - - ifeq "$(notdir $(CC))" "$(CC)" - replace_cc_with = $(call replace_with,$(1),$(CC),$(CLANG_OR_GCC)) - else - replace_cc_with = $(join $(dir $(CC)),$(call replace_with,$(1),$(notdir $(CC)),$(CLANG_OR_GCC))) - endif - - OBJCOPY ?= $(call replace_cc_with,objcopy) - ARCHIVER ?= $(call replace_cc_with,ar) - # Look for llvm-dwp or gnu dwp - DWP ?= $(call replace_cc_with,llvm-dwp) - ifeq ($(wildcard $(DWP)),) - DWP = $(call replace_cc_with,dwp) - ifeq ($(wildcard $(DWP)),) - DWP = $(shell command -v llvm-dwp 2> /dev/null) - ifeq ($(wildcard $(DWP)),) - DWP = $(shell command -v dwp 2> /dev/null) - endif - endif - endif - override AR = $(ARCHIVER) -endif - ifdef PIE LDFLAGS += -pie endif @@ -375,7 +324,7 @@ endif # Windows specific options #---------------------------------------------------------------------- ifeq "$(OS)" "Windows_NT" - ifneq (,$(findstring clang,$(CC))) + ifeq ($(CC_TYPE), clang) # Clang for Windows doesn't support C++ Exceptions CXXFLAGS += -fno-exceptions CXXFLAGS += -D_HAS_EXCEPTIONS=0 @@ -420,7 +369,7 @@ endif ifeq (1,$(USE_LIBSTDCPP)) # Clang requires an extra flag: -stdlib=libstdc++ - ifneq (,$(findstring clang,$(CC))) + ifeq ($(CC_TYPE), clang) # Force clang looking for the gcc's headers at specific rootfs folder. 
CXXFLAGS += -stdlib=libstdc++ $(GCC_TOOLCHAIN_FLAGS) LDFLAGS += -stdlib=libstdc++ $(GCC_TOOLCHAIN_FLAGS) @@ -458,7 +407,7 @@ ifeq (1, $(USE_SYSTEM_STDLIB)) CXXFLAGS += -nostdlib++ -nostdinc++ -cxx-isystem $(SDKROOT)/usr/include/c++/v1 LDFLAGS += -L$(SDKROOT)/usr/lib -Wl,-rpath,$(SDKROOT)/usr/lib -lc++ else - ifneq (,$(findstring clang,$(CC))) + ifeq ($(CC_TYPE),clang) # Force clang looking for the gcc's headers at specific rootfs folder. CXXFLAGS += $(GCC_TOOLCHAIN_FLAGS) LDFLAGS += $(GCC_TOOLCHAIN_FLAGS) @@ -485,8 +434,6 @@ DYLIB_OBJECTS +=$(strip $(DYLIB_C_SOURCES:.c=.o)) DYLIB_OBJECTS +=$(strip $(DYLIB_OBJC_SOURCES:.m=.o)) ifneq "$(strip $(DYLIB_CXX_SOURCES))" "" DYLIB_OBJECTS +=$(strip $(patsubst %.mm, %.o, $(DYLIB_CXX_SOURCES:.cpp=.o))) - CXX = $(call cxx_compiler,$(CC)) - LD = $(call cxx_linker,$(CC)) endif #---------------------------------------------------------------------- @@ -509,8 +456,6 @@ endif #---------------------------------------------------------------------- ifneq "$(strip $(CXX_SOURCES))" "" OBJECTS +=$(strip $(CXX_SOURCES:.cpp=.o)) - CXX = $(call cxx_compiler,$(CC)) - LD = $(call cxx_linker,$(CC)) endif #---------------------------------------------------------------------- @@ -526,19 +471,18 @@ endif #---------------------------------------------------------------------- ifneq "$(strip $(OBJCXX_SOURCES))" "" OBJECTS +=$(strip $(OBJCXX_SOURCES:.mm=.o)) - CXX = $(call cxx_compiler,$(CC)) - LD = $(call cxx_linker,$(CC)) ifeq "$(findstring lobjc,$(LDFLAGS))" "" LDFLAGS +=-lobjc endif endif -ifeq ($(findstring clang, $(CXX)), clang) +ifeq ($(CC_TYPE), clang) CXXFLAGS += --driver-mode=g++ endif ifneq "$(CXX)" "" - ifeq ($(findstring clang, $(LD)), clang) + # Specify the driver mode parameter if we use clang as the linker. + ifeq ($(LDC), clang) LDFLAGS += --driver-mode=g++ endif endif diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile b/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile index 9645fd9cc8dfb..304633c2dca1f 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile @@ -1,6 +1,6 @@ C_SOURCES := main.c -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile b/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile +++ b/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/cpp/Makefile b/lldb/test/API/functionalities/breakpoint/cpp/Makefile index 66108b79e7fe0..3b4be01d551f4 100644 --- a/lldb/test/API/functionalities/breakpoint/cpp/Makefile +++ 
b/lldb/test/API/functionalities/breakpoint/cpp/Makefile @@ -1,7 +1,7 @@ CXX_SOURCES := main.cpp CXXFLAGS_EXTRAS := -std=c++14 -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile b/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile index 9645fd9cc8dfb..304633c2dca1f 100644 --- a/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile +++ b/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile @@ -1,6 +1,6 @@ C_SOURCES := main.c -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile index 9645fd9cc8dfb..304633c2dca1f 100644 --- a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile @@ -1,6 +1,6 @@ C_SOURCES := main.c -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile +++ b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile b/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile +++ b/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py index a0d6802b3a506..c1cd9556c5ef3 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py @@ -16,12 +16,12 @@ def appkit_tester_impl(self, commands, use_constant_classes): self.build() else: disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. 
+ self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.appkit_common_data_formatters_command() commands() diff --git a/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py b/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py index 9ac41d67eb9ab..e1d7e42bdd1a9 100644 --- a/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py +++ b/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py @@ -26,12 +26,12 @@ def test_rdar11988289_with_run_command(self): def test_rdar11988289_with_run_command_no_const(self): """Test that NSDictionary reports its synthetic children properly.""" disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py b/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py index 053ec0ee9757e..1037e75c17eb3 100644 --- a/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py +++ b/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py @@ -26,12 +26,12 @@ def test_rdar12529957_with_run_command(self): def test_rdar12529957_with_run_command_no_const(self): """Test that NSSet reports its synthetic children properly.""" disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py b/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py index fff37829cd20d..db86f48f8ec1f 100644 --- a/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py +++ b/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py @@ -20,13 +20,13 @@ def test_print_array(self): def test_print_array_no_const(self): """Test that expr -O -Z works""" disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "USE_SYSTEM_STDLIB": "1", # See above. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. 
+ self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.printarray_data_formatter_commands() def setUp(self): diff --git a/lldb/test/API/functionalities/inline-stepping/Makefile b/lldb/test/API/functionalities/inline-stepping/Makefile index 362b89d7f995b..bf646c7b7db33 100644 --- a/lldb/test/API/functionalities/inline-stepping/Makefile +++ b/lldb/test/API/functionalities/inline-stepping/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := calling.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt b/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt index 7096efabdcfe1..d594b585b2d5f 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt +++ b/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt @@ -19,6 +19,7 @@ # to generate a Minidump when the binary crashes/requests such. # CC=g++ +CC_TYPE=gcc FLAGS=-g --std=c++11 INCLUDE=-I$HOME/breakpad/src/src/ LINK=-L. -lbreakpad -lpthread -nostdlib -lc -lstdc++ -lgcc_s -fno-exceptions diff --git a/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py b/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py index 14bfc322979b3..a7d6d9d155efc 100644 --- a/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py +++ b/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py @@ -12,12 +12,12 @@ def test_ordered_set(self): @skipUnlessDarwin def test_ordered_set_no_const(self): disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py b/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py index 68c0af76b8e3b..8debe731dfe1a 100644 --- a/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py +++ b/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py @@ -28,12 +28,12 @@ def test_single_entry_dict(self): ) # bug in NSDictionary formatting on watchos def test_single_entry_dict_no_const(self): disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/macosx/macCatalyst/Makefile b/lldb/test/API/macosx/macCatalyst/Makefile index 3f084968a2d57..ef17d89d2372c 100644 --- a/lldb/test/API/macosx/macCatalyst/Makefile +++ b/lldb/test/API/macosx/macCatalyst/Makefile @@ -7,6 +7,7 @@ USE_SYSTEM_STDLIB := 1 # FIXME: rdar://problem/54986190 # There is a Clang driver change missing on llvm.org. 
+override CC_TYPE=clang
 override CC=xcrun clang
 
 include Makefile.rules
diff --git a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
index b24fe3f574ccf..c77a186724fda 100644
--- a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
+++ b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
@@ -5,6 +5,7 @@ override TRIPLE := $(ARCH)-apple-ios13.0-macabi
 CFLAGS_EXTRAS := -target $(TRIPLE)
 
 # FIXME: rdar://problem/54986190
+override CC_TYPE=clang
 override CC=xcrun clang
 
 all: libfoo.dylib a.out
diff --git a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
index b712afdd7560a..3f5645a486bcb 100644
--- a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
+++ b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
@@ -59,10 +59,10 @@ def run_with(self, arch, os, vers, env, expected_load_command):
         self.build(
             dictionary={
                 "ARCH": arch,
-                "CC": clang,
                 "ARCH_CFLAGS": "-target {} {}".format(triple, version_min),
                 "SDKROOT": sdk_root,
-            }
+            },
+            compiler=clang,
         )
         self.check_load_commands(expected_load_command)
 
diff --git a/lldb/test/API/python_api/frame/inlines/Makefile b/lldb/test/API/python_api/frame/inlines/Makefile
index e6d9d8310a0fa..cf17569a5e351 100644
--- a/lldb/test/API/python_api/frame/inlines/Makefile
+++ b/lldb/test/API/python_api/frame/inlines/Makefile
@@ -1,6 +1,6 @@
 C_SOURCES := inlines.c
 
-ifneq (,$(findstring icc,$(CC)))
+ifeq ($(CC_TYPE), icc)
 	CFLAGS_EXTRAS := -debug inline-debug-info
 endif
 
diff --git a/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py b/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
index 16297efe14372..ed47f94e9492b 100644
--- a/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
+++ b/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
@@ -71,12 +71,12 @@ def check_simulator_ostype(self, sdk, platform_name, arch=platform.machine()):
         self.build(
             dictionary={
                 "EXE": exe_name,
-                "CC": clang,
                 "SDKROOT": sdkroot.strip(),
                 "ARCH": arch,
                 "ARCH_CFLAGS": "-target {} {}".format(triple, version_min),
                 "USE_SYSTEM_STDLIB": 1,
-            }
+            },
+            compiler=clang,
         )
         exe_path = os.path.realpath(self.getBuildArtifact(exe_name))
         cmd = [

From ba4bcce5f5ffa9e7d4af72c20fe4f1baf97075fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?=
Date: Wed, 11 Sep 2024 15:04:55 +0200
Subject: [PATCH 100/114] [GlobalIsel] Combine trunc of binop (#107721)

trunc (binop X, C) --> binop (trunc X, trunc C) --> binop (trunc X, C')

Try to narrow the width of math or bitwise logic instructions by
pulling a truncate ahead of binary operators.

Vx and Nx cores consider 32-bit and 64-bit basic arithmetic to be equal
in cost.
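
For illustration, here is a minimal sketch of the transform in MIR
(register names are illustrative; the shape mirrors the new AArch64
test added below):

  Before:
    %lhs:_(s64) = COPY $x0
    %rhs:_(s64) = G_CONSTANT i64 5
    %res:_(s64) = G_XOR %lhs, %rhs
    %small:_(s32) = G_TRUNC %res(s64)

  After:
    %lhs:_(s64) = COPY $x0
    %t:_(s32) = G_TRUNC %lhs(s64)
    %c:_(s32) = G_CONSTANT i32 5
    %small:_(s32) = G_XOR %t, %c

The combine fires only when the binop result has a single (non-debug)
use and the narrowed operation is legal for the target, or we are still
running before the legalizer.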
--- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 6 + .../include/llvm/Target/GlobalISel/Combine.td | 36 +- .../GlobalISel/CombinerHelperCasts.cpp | 46 + .../GlobalISel/combine-narrow-binop.mir | 136 + .../AArch64/GlobalISel/inline-memset.mir | 8 +- ...izer-combiner-divrem-insertpt-conflict.mir | 9 +- .../AMDGPU/GlobalISel/combine-itofp.mir | 16 +- .../AMDGPU/GlobalISel/combine-zext-trunc.mir | 26 +- ...-divergent-i1-phis-no-lane-mask-merging.ll | 7 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 3021 +++++++++-------- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 2940 ++++++++-------- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 73 +- .../AMDGPU/GlobalISel/shl-ext-reduce.ll | 3 + .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 64 +- .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 18 +- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 14 +- llvm/test/CodeGen/AMDGPU/constrained-shift.ll | 2 - llvm/test/CodeGen/AMDGPU/ctlz.ll | 4 +- 18 files changed, 3396 insertions(+), 3033 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 9b62d6067be39..828532dcffb7d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -831,6 +831,12 @@ class CombinerHelper { /// Combine ors. bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo); + /// trunc (binop X, C) --> binop (trunc X, trunc C). + bool matchNarrowBinop(const MachineInstr &TruncMI, + const MachineInstr &BinopMI, BuildFnTy &MatchInfo); + + bool matchCastOfInteger(const MachineInstr &CastMI, APInt &MatchInfo); + /// Combine addos. bool matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 525cc815e73ce..a595a51d7b01f 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1867,6 +1867,33 @@ class buildvector_of_opcode : GICombineRule < def buildvector_of_truncate : buildvector_of_opcode; +// narrow binop. +// trunc (binop X, C) --> binop (trunc X, trunc C) +class narrow_binop_opcode : GICombineRule < + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_CONSTANT $const, $imm), + (binopOpcode $binop, $x, $const):$Binop, + (G_TRUNC $root, $binop):$Trunc, + [{ return Helper.matchNarrowBinop(*${Trunc}, *${Binop}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${Trunc}, ${matchinfo}); }])>; + +def narrow_binop_add : narrow_binop_opcode; +def narrow_binop_sub : narrow_binop_opcode; +def narrow_binop_mul : narrow_binop_opcode; +def narrow_binop_and : narrow_binop_opcode; +def narrow_binop_or : narrow_binop_opcode; +def narrow_binop_xor : narrow_binop_opcode; + +// Cast of integer. 
+class integer_of_opcode : GICombineRule < + (defs root:$root, apint_matchinfo:$matchinfo), + (match (G_CONSTANT $int, $imm), + (castOpcode $root, $int):$Cast, + [{ return Helper.matchCastOfInteger(*${Cast}, ${matchinfo}); }]), + (apply [{ Helper.replaceInstWithConstant(*${Cast}, ${matchinfo}); }])>; + +def integer_of_truncate : integer_of_opcode; + def cast_combines: GICombineGroup<[ truncate_of_zext, truncate_of_sext, @@ -1881,7 +1908,14 @@ def cast_combines: GICombineGroup<[ anyext_of_anyext, anyext_of_zext, anyext_of_sext, - buildvector_of_truncate + buildvector_of_truncate, + narrow_binop_add, + narrow_binop_sub, + narrow_binop_mul, + narrow_binop_and, + narrow_binop_or, + narrow_binop_xor, + integer_of_truncate ]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp index 8714fdabf6549..30557e6a2304e 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp @@ -313,3 +313,49 @@ bool CombinerHelper::matchCastOfBuildVector(const MachineInstr &CastMI, return true; } + +bool CombinerHelper::matchNarrowBinop(const MachineInstr &TruncMI, + const MachineInstr &BinopMI, + BuildFnTy &MatchInfo) { + const GTrunc *Trunc = cast(&TruncMI); + const GBinOp *BinOp = cast(&BinopMI); + + if (!MRI.hasOneNonDBGUse(BinOp->getReg(0))) + return false; + + Register Dst = Trunc->getReg(0); + LLT DstTy = MRI.getType(Dst); + + // Is narrow binop legal? + if (!isLegalOrBeforeLegalizer({BinOp->getOpcode(), {DstTy}})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + auto LHS = B.buildTrunc(DstTy, BinOp->getLHSReg()); + auto RHS = B.buildTrunc(DstTy, BinOp->getRHSReg()); + B.buildInstr(BinOp->getOpcode(), {Dst}, {LHS, RHS}); + }; + + return true; +} + +bool CombinerHelper::matchCastOfInteger(const MachineInstr &CastMI, + APInt &MatchInfo) { + const GExtOrTruncOp *Cast = cast(&CastMI); + + APInt Input = getIConstantFromReg(Cast->getSrcReg(), MRI); + + LLT DstTy = MRI.getType(Cast->getReg(0)); + + if (!isConstantLegalOrBeforeLegalizer(DstTy)) + return false; + + switch (Cast->getOpcode()) { + case TargetOpcode::G_TRUNC: { + MatchInfo = Input.trunc(DstTy.getScalarSizeInBits()); + return true; + } + default: + return false; + } +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir new file mode 100644 index 0000000000000..f207e9c149a47 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir @@ -0,0 +1,136 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s --check-prefixes=CHECK + +--- +name: test_combine_trunc_xor_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_xor_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_XOR [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_XOR %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... 
+--- +name: test_combine_trunc_add_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_add_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_ADD [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_ADD %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... +--- +name: test_combine_trunc_mul_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_mul_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_MUL [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_MUL %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... +--- +name: test_combine_trunc_and_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_and_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_AND %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... +--- +name: test_combine_trunc_or_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_or_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_OR [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_OR %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... +--- +name: test_combine_trunc_sub_i128 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_sub_i128 + ; CHECK: %lhs:_(s128) = COPY $q0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s128) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_SUB [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s128) = COPY $q0 + %rhs:_(s128) = G_CONSTANT i128 5 + %res:_(s128) = G_SUB %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s128) + $w0 = COPY %small(s32) +... +--- +name: test_combine_trunc_sub_i128_multi_use +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_sub_i128_multi_use + ; CHECK: %lhs:_(s128) = COPY $q0 + ; CHECK-NEXT: %rhs:_(s128) = G_CONSTANT i128 5 + ; CHECK-NEXT: %res:_(s128) = G_SUB %lhs, %rhs + ; CHECK-NEXT: %small:_(s32) = G_TRUNC %res(s128) + ; CHECK-NEXT: $q0 = COPY %res(s128) + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s128) = COPY $q0 + %rhs:_(s128) = G_CONSTANT i128 5 + %res:_(s128) = G_SUB %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s128) + $q0 = COPY %res(s128) + $w0 = COPY %small(s32) +... 
+--- +name: test_combine_trunc_xor_vector_pattern_did_not_match +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_xor_vector_pattern_did_not_match + ; CHECK: %arg1:_(s64) = COPY $x0 + ; CHECK-NEXT: %arg2:_(s64) = COPY $x0 + ; CHECK-NEXT: %lhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + ; CHECK-NEXT: %rhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + ; CHECK-NEXT: %res:_(<2 x s64>) = G_XOR %lhs, %rhs + ; CHECK-NEXT: %small:_(<2 x s16>) = G_TRUNC %res(<2 x s64>) + ; CHECK-NEXT: $w0 = COPY %small(<2 x s16>) + %arg1:_(s64) = COPY $x0 + %arg2:_(s64) = COPY $x0 + %lhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + %rhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + %res:_(<2 x s64>) = G_XOR %lhs, %rhs + %small:_(<2 x s16>) = G_TRUNC %res(<2 x s64>) + $w0 = COPY %small(<2 x s16>) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir index fee5afd3ddbb2..9ed1e2d9eee3b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir @@ -224,10 +224,10 @@ body: | ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) ; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[PTR_ADD1]](p0) :: (store (s16) into %ir.dst + 16, align 1) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 16448 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; CHECK-NEXT: G_STORE [[C2]](s16), [[PTR_ADD1]](p0) :: (store (s16) into %ir.dst + 16, align 1) ; CHECK-NEXT: RET_ReallyLR %0:_(p0) = COPY $x0 %1:_(s8) = G_CONSTANT i8 64 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir index e51d9bd13163b..a87ff305d1535 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir @@ -8,9 +8,8 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: test - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) - ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: $w0 = COPY [[C]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s16) = G_CONSTANT i16 0 %2:_(s1) = G_CONSTANT i1 true @@ -41,9 +40,7 @@ body: | bb.1: ; CHECK-LABEL: name: test_inverted_div_rem ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s8) - ; CHECK-NEXT: $w0 = COPY [[SEXT]](s32) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s16) = G_CONSTANT i16 0 %2:_(s1) = G_CONSTANT i1 true diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir index e4f11dfa9e027..d6135d86022be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir @@ -193,10 +193,10 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 255 @@ -216,10 +216,10 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 255 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir index 3b914df7f8f8a..3423af64162e5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir @@ -12,9 +12,11 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %c3FFF:_(s32) = G_CONSTANT i32 16383 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %c3FFF - ; GCN-NEXT: $vgpr0 = COPY %low_bits(s32) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]] + ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s32) = COPY $vgpr0 %c3FFF:_(s32) = G_CONSTANT i32 16383 %low_bits:_(s32) = G_AND %var, %c3FFF @@ -34,10 +36,8 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %cFFFFF:_(s32) = G_CONSTANT i32 1048575 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %cFFFFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s32) - ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: %zext:_(s32) = G_ZEXT [[TRUNC]](s16) ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s32) = COPY $vgpr0 %cFFFFF:_(s32) = G_CONSTANT i32 1048575 @@ -58,9 +58,9 @@ body: | ; GCN: liveins: $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1 - ; GCN-NEXT: %c3FFF:_(s64) = G_CONSTANT i64 16383 - ; GCN-NEXT: %low_bits:_(s64) = G_AND %var, %c3FFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s64) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s64) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: 
%trunc:_(s16) = G_AND [[TRUNC]], [[C]] ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s64) = COPY $vgpr0_vgpr1 @@ -82,9 +82,9 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %c3FFF:_(s32) = G_CONSTANT i32 16383 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %c3FFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s32) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]] ; GCN-NEXT: %zext:_(s64) = G_ZEXT %trunc(s16) ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(s64) %var:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index 966a481b6594d..bb7bc0447aea0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -238,13 +238,12 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0 ; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_xor_b32_e32 v3, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: ; implicit-def: $vgpr3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index afffebea451a0..3bd3486ec261d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -350,10 +350,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX8-LABEL: s_fshl_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s3, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -362,10 +364,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX9-LABEL: s_fshl_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_and_b32 s3, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -377,7 +381,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX10-NEXT: s_and_b32 s3, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; 
GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -389,7 +395,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX11-NEXT: s_and_b32 s3, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -416,11 +424,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -429,11 +437,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -441,11 +449,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX10-LABEL: v_fshl_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v3, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -454,12 +462,12 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX11-LABEL: v_fshl_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v3, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 @@ -692,22 +700,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 
s3, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s4, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s3, 7, s5 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s2, s2, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff @@ -719,22 +731,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s4, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, 7, s5 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff @@ -745,21 +761,25 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-LABEL: s_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-NEXT: s_and_b32 s5, s2, 7 +; GFX10-NEXT: s_lshr_b32 s6, s2, 8 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_and_b32 s6, s2, 7 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_and_b32 s5, s6, 7 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_andn2_b32 s6, 7, s6 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s5, 7 -; GFX10-NEXT: s_andn2_b32 s5, 7, s5 -; GFX10-NEXT: s_lshr_b32 s4, s4, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, s5 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -772,21 +792,25 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX11-LABEL: s_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: s_and_b32 s5, s2, 7 +; GFX11-NEXT: s_lshr_b32 s6, s2, 8 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: 
s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s6, s2, 7 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, s5 +; GFX11-NEXT: s_and_b32 s5, s6, 7 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_and_not1_b32 s6, 7, s6 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s6, s5, 7 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s3, s3, s5 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -837,20 +861,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v2, v5 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -863,20 +887,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v2, v5 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX9-NEXT: v_lshrrev_b16_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; 
GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -886,24 +910,24 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-LABEL: v_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX10-NEXT: v_not_b32_e32 v7, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16 v3, v6, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -913,26 +937,26 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX11-LABEL: v_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX11-NEXT: v_not_b32_e32 v7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX11-NEXT: v_lshrrev_b16 v4, 1, v4 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX11-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX11-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-NEXT: v_lshlrev_b16 v4, v4, v5 ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b16 v4, v6, v4 +; GFX11-NEXT: v_lshrrev_b16 v3, v6, v3 ; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 @@ -1002,13 +1026,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 ; GFX8-NEXT: 
s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 ; GFX8-NEXT: s_and_b32 s12, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 @@ -1016,29 +1042,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, s6, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s3, 7, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s2, s2, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 7 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s3, s7, 0xff ; GFX8-NEXT: s_lshl_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s7, 0xff +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_andn2_b32 s4, 7, s10 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_andn2_b32 s3, 7, s10 -; GFX8-NEXT: s_lshr_b32 s4, s4, 1 -; GFX8-NEXT: s_lshr_b32 s3, s4, s3 +; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_andn2_b32 s4, 7, s11 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s3, s5, s3 -; GFX8-NEXT: s_lshr_b32 s5, s8, 1 +; GFX8-NEXT: s_andn2_b32 s5, 7, s11 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s4, s8, 1 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshr_b32 s4, s5, s4 +; GFX8-NEXT: s_lshr_b32 s4, s4, s5 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_or_b32 s3, s3, s4 @@ -1055,13 +1087,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 ; GFX9-NEXT: s_and_b32 s12, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 @@ -1069,29 +1103,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s6, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, 7, s9 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; 
GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s10, 7 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_and_b32 s3, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s4, s7, 0xff +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_andn2_b32 s4, 7, s10 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX9-NEXT: s_andn2_b32 s3, 7, s10 -; GFX9-NEXT: s_lshr_b32 s4, s4, 1 -; GFX9-NEXT: s_lshr_b32 s3, s4, s3 +; GFX9-NEXT: s_lshr_b32 s3, s3, s4 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_andn2_b32 s4, 7, s11 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s3, s5, s3 -; GFX9-NEXT: s_lshr_b32 s5, s8, 1 +; GFX9-NEXT: s_andn2_b32 s5, 7, s11 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s4, s8, 1 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s4, s5, s4 +; GFX9-NEXT: s_lshr_b32 s4, s4, s5 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: s_or_b32 s3, s3, s4 @@ -1108,48 +1148,56 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_and_b32 s11, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_andn2_b32 s12, 7, s2 +; GFX10-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, 0xff -; GFX10-NEXT: s_and_b32 s6, s9, 7 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s11 +; GFX10-NEXT: s_lshr_b32 s1, s1, s12 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, 0xff -; GFX10-NEXT: s_and_b32 s3, s10, 7 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_andn2_b32 s6, 7, s10 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_and_b32 s1, s9, 7 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshr_b32 s6, s6, 1 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX10-NEXT: s_lshl_b32 s1, s3, s1 +; GFX10-NEXT: s_lshr_b32 s3, s6, s9 +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s6 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s2, 24 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s11, 7 -; GFX10-NEXT: s_andn2_b32 s6, 7, s11 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX10-NEXT: s_andn2_b32 s6, 7, s10 +; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_and_b32 s7, 
s2, 7 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX10-NEXT: s_lshr_b32 s7, s8, 1 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 -; GFX10-NEXT: s_lshr_b32 s5, s7, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, s6 +; GFX10-NEXT: s_lshr_b32 s2, s7, s2 +; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_or_b32 s2, s5, s2 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_and_b32 s3, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s3, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1161,48 +1209,56 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_and_b32 s11, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 +; GFX11-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_and_b32 s2, s6, 0xff -; GFX11-NEXT: s_and_b32 s6, s9, 7 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 -; GFX11-NEXT: s_lshl_b32 s0, s0, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s2, s2, s9 +; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, s11 +; GFX11-NEXT: s_lshr_b32 s1, s1, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_and_b32 s3, s10, 7 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_and_b32 s1, s9, 7 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s6, s6, 1 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s1, s3, s1 +; GFX11-NEXT: s_lshr_b32 s3, s6, s9 +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s6 +; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_lshr_b32 s2, s2, 24 ; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s2, s6 -; GFX11-NEXT: s_and_b32 s4, s11, 7 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s11 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_and_b32 s7, s2, 7 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX11-NEXT: s_lshr_b32 s7, s8, 1 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_lshr_b32 s5, s7, s6 -; 
GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s5, s5, s6 +; GFX11-NEXT: s_lshr_b32 s2, s7, s2 +; GFX11-NEXT: s_or_b32 s3, s3, s4 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_or_b32 s2, s5, s2 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s2, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s3, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_lshl_b32 s1, s2, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1271,37 +1327,38 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshl_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v7, v2 -; GFX8-NEXT: v_mov_b32_e32 v9, 1 +; GFX8-NEXT: v_mov_b32_e32 v8, 1 +; GFX8-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX8-NEXT: v_lshrrev_b16_e32 v9, v10, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 7, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v9, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, 0xff +; GFX8-NEXT: v_mov_b32_e32 v7, 0xff ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_mov_b32_e32 v9, -1 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8 +; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; 
GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 @@ -1320,46 +1377,47 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshl_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_not_b32_e32 v7, v2 -; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v8, 1 +; GFX9-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v0 -; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_lshrrev_b16_e32 v9, v10, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v5, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 7, v5 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v9, v3 ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 0xff ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 7 +; GFX9-NEXT: v_mov_b32_e32 v10, -1 ; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_xor_b32_sdwa v11, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshrrev_b16_e32 v10, 1, v10 +; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v9, 1, v9 +; GFX9-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_lshrrev_b16_e32 v9, v11, v9 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v9 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1 +; GFX9-NEXT: v_and_or_b32 v1, v6, v7, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1368,41 +1426,42 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-LABEL: v_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 0xff, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v1 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_not_b32_e32 v12, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_lshrrev_b16 v9, 1, v11 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v12 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX10-NEXT: v_lshlrev_b16 v3, v7, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, 7 -; GFX10-NEXT: v_not_b32_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_not_b32_sdwa v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_not_b32_e32 v8, v2 -; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_sdwa v14, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 +; GFX10-NEXT: v_lshrrev_b16 v8, 1, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v10 +; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, 0xff +; GFX10-NEXT: v_mov_b32_e32 v11, -1 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_mov_b32_e32 v13, 7 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v10, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v11, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_sdwa v14, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; 
GFX10-NEXT: v_and_b32_e32 v7, 7, v12 -; GFX10-NEXT: v_lshrrev_b16 v10, 1, v10 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 ; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v13, v1 +; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 ; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v7, v10 -; GFX10-NEXT: v_lshrrev_b16 v7, v8, v9 +; GFX10-NEXT: v_lshrrev_b16 v5, v11, v12 +; GFX10-NEXT: v_lshrrev_b16 v7, v9, v8 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 @@ -1426,7 +1485,7 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_not_b32_e32 v13, v9 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 @@ -1434,22 +1493,22 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3 -; GFX11-NEXT: v_not_b32_e32 v9, v10 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 -; GFX11-NEXT: v_not_b32_e32 v13, v11 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_and_b32_e32 v12, 7, v2 -; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-NEXT: v_lshlrev_b16 v4, v10, v4 ; GFX11-NEXT: v_lshrrev_b16 v6, v9, v7 @@ -5087,23 +5146,48 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) } define amdgpu_ps i64 @s_fshl_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) { -; GCN-LABEL: s_fshl_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GCN-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_fshl_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX6-NEXT: s_not_b32 s4, s4 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX8-NEXT: s_not_b32 s4, s4 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_or_b64 
s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_not_b32 s4, s4 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_not_b32 s5, s4 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[4:5] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_not_b32 s5, s4 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: ; return to shader part epilog @@ -5181,8 +5265,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 @@ -5194,8 +5278,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5207,8 +5291,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5362,36 +5446,36 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v1 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX6-NEXT: v_lshr_b64 v[2:3], s[0:1], v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_lshr_b64 v[3:4], s[0:1], v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i64_ssv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX8-NEXT: v_or_b32_e32 v0, 
v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] +; GFX8-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i64_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] +; GFX9-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i64_ssv: @@ -5429,10 +5513,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX6-LABEL: v_fshl_i64_svs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_andn2_b32 s3, 63, s2 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s3 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -5440,10 +5523,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX8-LABEL: v_fshl_i64_svs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_andn2_b32 s3, 63, s2 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -5451,10 +5533,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX9-LABEL: v_fshl_i64_svs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_andn2_b32 s3, 63, s2 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -5462,10 +5543,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX10-LABEL: v_fshl_i64_svs: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3] -; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63 +; GFX10-NEXT: s_andn2_b32 s3, 63, s2 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -5473,13 +5553,12 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX11-LABEL: v_fshl_i64_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; 
GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3] -; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_not1_b32 s3, 63, s2 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5490,10 +5569,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg %amt) { ; GFX6-LABEL: v_fshl_i64_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s4 +; GFX6-NEXT: s_and_b32 s3, s2, 63 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s3 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX6-NEXT: s_not_b32 s2, s2 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5501,10 +5580,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX8-LABEL: v_fshl_i64_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX8-NEXT: s_and_b32 s3, s2, 63 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX8-NEXT: s_not_b32 s2, s2 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5512,10 +5591,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX9-LABEL: v_fshl_i64_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX9-NEXT: s_and_b32 s3, s2, 63 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX9-NEXT: s_not_b32 s2, s2 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5523,10 +5602,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX10-LABEL: v_fshl_i64_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: s_and_b32 s3, s2, 63 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5534,10 +5613,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX11-LABEL: v_fshl_i64_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_and_b32 s3, s2, 63 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX11-NEXT: s_not_b32 s2, 
s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -5553,80 +5632,70 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX6-NEXT: s_not_b32 s8, s8 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: s_not_b32 s6, s10 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX8-NEXT: s_not_b32 s8, s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX8-NEXT: s_not_b32 s6, s10 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX9-NEXT: s_not_b32 s8, s8 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX9-NEXT: s_not_b32 s6, s10 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX10-NEXT: s_not_b32 s9, s8 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX10-NEXT: s_and_b64 s[8:9], s[10:11], 63 -; GFX10-NEXT: s_andn2_b64 s[10:11], 63, s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX10-NEXT: s_not_b32 s8, s10 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX10-NEXT: 
s_lshr_b64 s[6:7], s[6:7], s8 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[8:9] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX11-NEXT: s_not_b32 s9, s8 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX11-NEXT: s_and_b64 s[8:9], s[10:11], 63 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 63, s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX11-NEXT: s_not_b32 s8, s10 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s8 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX11-NEXT: ; return to shader part epilog @@ -5639,18 +5708,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1 +; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 -; GFX6-NEXT: v_not_b32_e32 v8, v10 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v8 +; GFX6-NEXT: v_not_b32_e32 v4, v10 +; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5660,18 +5729,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX8-NEXT: v_not_b32_e32 v8, v10 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7] +; GFX8-NEXT: v_not_b32_e32 v4, v10 +; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5681,18 +5750,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v8, v10 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: 
v_or_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7] +; GFX9-NEXT: v_not_b32_e32 v4, v10 +; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5750,231 +5819,236 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: s_fshl_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX6-NEXT: s_sub_i32 s9, s10, 64 -; GFX6-NEXT: s_sub_i32 s11, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_and_b32 s9, s8, 0x7f +; GFX6-NEXT: s_sub_i32 s11, s9, 64 +; GFX6-NEXT: s_sub_i32 s14, 64, s9 +; GFX6-NEXT: s_cmp_lt_u32 s9, 64 ; GFX6-NEXT: s_cselect_b32 s18, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX6-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s9, 0 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_mov_b32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s9, 0 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX6-NEXT: s_lshl_b32 s13, s6, 31 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX6-NEXT: s_lshl_b32 s11, s6, 31 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: s_sub_i32 s12, s8, 64 -; GFX6-NEXT: s_sub_i32 s10, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX6-NEXT: s_not_b32 s9, s8 +; GFX6-NEXT: s_sub_i32 s14, s6, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s6 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_cselect_b32 s15, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX6-NEXT: s_or_b64 s[0:1], 
s[12:13], s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX8-NEXT: s_sub_i32 s9, s10, 64 -; GFX8-NEXT: s_sub_i32 s11, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_and_b32 s9, s8, 0x7f +; GFX8-NEXT: s_sub_i32 s11, s9, 64 +; GFX8-NEXT: s_sub_i32 s14, 64, s9 +; GFX8-NEXT: s_cmp_lt_u32 s9, 64 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX8-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s9, 0 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX8-NEXT: s_cmp_lg_u32 s18, 0 -; GFX8-NEXT: s_mov_b32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s9, 0 +; GFX8-NEXT: s_mov_b32 s10, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX8-NEXT: s_lshl_b32 s13, s6, 31 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX8-NEXT: s_lshl_b32 s11, s6, 31 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX8-NEXT: s_sub_i32 s12, s8, 64 -; GFX8-NEXT: s_sub_i32 s10, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX8-NEXT: s_not_b32 s9, s8 +; GFX8-NEXT: s_sub_i32 s14, s6, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_cselect_b32 s15, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX8-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX9-NEXT: s_sub_i32 s9, s10, 64 -; GFX9-NEXT: s_sub_i32 s11, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_and_b32 s9, s8, 0x7f +; GFX9-NEXT: s_sub_i32 s11, s9, 64 +; GFX9-NEXT: 
s_sub_i32 s14, 64, s9 +; GFX9-NEXT: s_cmp_lt_u32 s9, 64 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX9-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s9, 0 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_mov_b32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: s_mov_b32 s10, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX9-NEXT: s_lshl_b32 s13, s6, 31 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX9-NEXT: s_lshl_b32 s11, s6, 31 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_sub_i32 s12, s8, 64 -; GFX9-NEXT: s_sub_i32 s10, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX9-NEXT: s_not_b32 s9, s8 +; GFX9-NEXT: s_sub_i32 s14, s6, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_cselect_b32 s15, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX9-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX10-NEXT: s_sub_i32 s9, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_and_b32 s9, s8, 0x7f +; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: s_sub_i32 s11, s9, 64 +; GFX10-NEXT: s_sub_i32 s12, 64, s9 +; GFX10-NEXT: s_cmp_lt_u32 s9, 64 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[0:1], s11 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: 
s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s9, 0 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 +; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 +; GFX10-NEXT: s_cselect_b64 s[14:15], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX10-NEXT: s_lshl_b32 s13, s6, 31 +; GFX10-NEXT: s_lshl_b32 s11, s6, 31 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] -; GFX10-NEXT: s_sub_i32 s14, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_not_b32 s10, s8 +; GFX10-NEXT: s_sub_i32 s12, s6, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX10-NEXT: s_lshl_b64 s[12:13], s[4:5], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s10 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9] -; GFX11-NEXT: s_sub_i32 s9, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 -; GFX11-NEXT: s_mov_b32 s12, 0 -; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_and_b32 s9, s8, 0x7f +; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: s_sub_i32 s11, s9, 64 +; GFX11-NEXT: s_sub_i32 s12, 64, s9 +; GFX11-NEXT: s_cmp_lt_u32 s9, 64 ; GFX11-NEXT: s_cselect_b32 s18, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[0:1], s11 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s9, 0 +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 +; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], 
s[14:15] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_cselect_b64 s[14:15], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX11-NEXT: s_lshl_b32 s13, s6, 31 +; GFX11-NEXT: s_lshl_b32 s11, s6, 31 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] -; GFX11-NEXT: s_sub_i32 s14, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s15, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_and_not1_b32 s6, 0x7f, s8 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX11-NEXT: s_not_b32 s10, s8 +; GFX11-NEXT: s_sub_i32 s12, s6, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_cselect_b32 s13, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX11-NEXT: s_lshl_b64 s[12:13], s[4:5], s9 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s10 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX11-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -5985,143 +6059,143 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v14 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v14 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v14 -; GFX6-NEXT: v_lshl_b64 v[12:13], v[0:1], v14 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15 +; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 +; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9 +; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 +; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v16 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX6-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX6-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v10, 
vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX6-NEXT: v_not_b32_e32 v4, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v15 -; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v15 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v15 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14 +; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v14 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v15 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v14 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v15 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v14 -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v14 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15 +; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 +; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX8-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX8-NEXT: v_not_b32_e32 v4, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v15 -; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v15 
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v15, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, 64, v14 -; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v14 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15 +; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 +; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX9-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v10, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v3, vcc +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v9, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1 -; GFX9-NEXT: v_sub_u32_e32 v6, 64, v15 -; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v15 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14 +; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] +; GFX9-NEXT: 
v_lshrrev_b64 v[2:3], v15, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX9-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6129,15 +6203,15 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX10-NEXT: v_not_b32_e32 v8, v8 +; GFX10-NEXT: v_not_b32_e32 v10, v8 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v18 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5 ; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] @@ -6175,43 +6249,43 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX11-LABEL: v_fshl_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX11-NEXT: v_not_b32_e32 v8, v8 +; GFX11-NEXT: v_not_b32_e32 v10, v8 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 ; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; 
GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13] -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX11-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_cndmask_b32 v7, 0, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v15, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v5, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s0 @@ -6229,173 +6303,173 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v6 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0 -; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v6 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v6 -; GFX6-NEXT: v_lshl_b64 v[4:5], s[0:1], v6 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1 +; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 +; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v8 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v8 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 s9, s6, 31 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; 
GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v10 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v7 +; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v6 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v6 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1] -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s9, s6, 31 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: 
v_subrev_u32_e32 v11, vcc, 64, v7 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v6 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v6 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1] -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_lshl_b32 s9, s6, 31 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v7 +; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] -; GFX9-NEXT: 
v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX9-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_not_b32_e32 v2, v0 ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX10-NEXT: s_lshl_b32 s9, s6, 31 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v12 +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] ; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] @@ -6434,58 +6508,52 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX11-LABEL: v_fshl_i128_ssv: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX11-NEXT: v_not_b32_e32 v0, v0 +; GFX11-NEXT: v_not_b32_e32 v2, v0 ; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX11-NEXT: s_lshl_b32 s9, s6, 31 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v2 ; GFX11-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] -; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v12 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13 +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v1 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s9, s1 -; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 ; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6495,43 +6563,43 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_sub_i32 s5, s6, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s12, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: v_lshr_b64 v[0:1], 
v[0:1], 1 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: s_sub_i32 s0, s4, 64 -; GFX6-NEXT: s_sub_i32 s1, 64, s4 +; GFX6-NEXT: s_sub_i32 s1, s0, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s5, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s4 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 -; GFX6-NEXT: s_cselect_b32 s6, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1 ; GFX6-NEXT: s_and_b32 s0, 1, s5 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s6 +; GFX6-NEXT: s_and_b32 s0, 1, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6539,51 +6607,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX6-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX6-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_sub_i32 s5, s6, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s12, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX8-NEXT: 
v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: s_sub_i32 s0, s4, 64 -; GFX8-NEXT: s_sub_i32 s1, 64, s4 +; GFX8-NEXT: s_sub_i32 s1, s0, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] ; GFX8-NEXT: s_and_b32 s0, 1, s5 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s6 +; GFX8-NEXT: s_and_b32 s0, 1, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6591,50 +6659,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX8-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX8-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_sub_i32 s5, s6, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s12, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 31, v1 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: s_sub_i32 s0, s4, 64 -; GFX9-NEXT: s_sub_i32 s1, 64, s4 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: s_sub_i32 s1, s0, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 
s4, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] ; GFX9-NEXT: s_and_b32 s0, 1, s5 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s6 +; GFX9-NEXT: s_and_b32 s0, 1, s8 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6642,50 +6710,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX9-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_sub_i32 s12, s5, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s7 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 -; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 31, v1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s4 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s0, 64, s4 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX10-NEXT: s_sub_i32 s0, s4, 64 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_andn2_b32 s0, 0x7f, s4 +; GFX10-NEXT: s_sub_i32 s1, 64, s0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX10-NEXT: s_sub_i32 s1, s0, 64 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] 
+; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s1 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] ; GFX10-NEXT: s_and_b32 s0, 1, s5 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo @@ -6695,62 +6763,62 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX10-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i128_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_sub_i32 s12, s5, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s7 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 -; GFX11-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 -; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 31, v1 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s4 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s5, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s0, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s0, 0x7f, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX11-NEXT: s_sub_i32 s0, s4, 64 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_sub_i32 s1, 64, s0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX11-NEXT: s_sub_i32 s1, s0, 64 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-NEXT: s_and_b32 s0, 1, s1 +; GFX11-NEXT: s_and_b32 s1, 1, s4 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; 
GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] ; GFX11-NEXT: s_and_b32 s0, 1, s5 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX11-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6760,25 +6828,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_sub_i32 s5, s6, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s7, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 -; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s10, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s7 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s6 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5 -; GFX6-NEXT: s_and_b32 s5, 1, s9 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s8 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s5 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s7 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b32 s9, s2, 31 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s6 +; GFX6-NEXT: s_lshl_b32 s7, s2, 31 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5 +; GFX6-NEXT: s_and_b32 s5, 1, s9 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: s_not_b32 s6, s4 +; GFX6-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX6-NEXT: s_and_b32 s5, 1, s10 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_sub_i32 s10, s4, 64 ; GFX6-NEXT: s_sub_i32 s8, 64, s4 @@ -6793,19 +6862,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX6-NEXT: 
s_lshr_b64 s[4:5], s[2:3], s6 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6814,25 +6883,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX8-LABEL: v_fshl_i128_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_sub_i32 s5, s6, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s7, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 -; GFX8-NEXT: s_mov_b32 s8, 0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 +; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX8-NEXT: s_and_b32 s5, 1, s9 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b32 s9, s2, 31 -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX8-NEXT: s_lshl_b32 s7, s2, 31 +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] +; GFX8-NEXT: s_and_b32 s5, 1, s9 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_not_b32 s6, s4 +; GFX8-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_and_b32 s5, 1, s10 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_sub_i32 s10, s4, 64 ; GFX8-NEXT: s_sub_i32 s8, 64, s4 @@ -6847,19 +6917,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6868,25 +6938,26 @@ 
define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX9-LABEL: v_fshl_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_sub_i32 s5, s6, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s7, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 -; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX9-NEXT: s_and_b32 s5, 1, s9 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b32 s9, s2, 31 -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX9-NEXT: s_lshl_b32 s7, s2, 31 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] +; GFX9-NEXT: s_and_b32 s5, 1, s9 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_not_b32 s6, s4 +; GFX9-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_and_b32 s5, 1, s10 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_sub_i32 s10, s4, 64 ; GFX9-NEXT: s_sub_i32 s8, 64, s4 @@ -6901,19 +6972,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6922,53 +6993,54 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX10-LABEL: v_fshl_i128_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f +; GFX10-NEXT: s_sub_i32 s6, s5, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; 
GFX10-NEXT: s_and_b32 s6, 1, s8 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_lshl_b32 s7, s2, 31 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX10-NEXT: s_and_b32 s5, 1, s9 +; GFX10-NEXT: s_and_b32 s5, 1, s8 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_sub_i32 s10, s4, 64 -; GFX10-NEXT: s_sub_i32 s8, 64, s4 +; GFX10-NEXT: s_andn2_b32 s6, 0x7f, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_and_b32 s5, 1, s9 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_not_b32 s8, s4 +; GFX10-NEXT: s_sub_i32 s10, s6, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s7 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo -; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -6976,50 +7048,52 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX11-LABEL: v_fshl_i128_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s6, s5, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX11-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX11-NEXT: s_and_b32 s6, 1, s8 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX11-NEXT: 
v_lshlrev_b64 v[0:1], s6, v[0:1] ; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_lshl_b32 s7, s2, 31 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX11-NEXT: s_and_b32 s5, 1, s9 +; GFX11-NEXT: s_and_b32 s5, 1, s8 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_sub_i32 s10, s4, 64 -; GFX11-NEXT: s_sub_i32 s8, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s6, 0x7f, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_and_b32 s5, 1, s9 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_not_b32 s8, s4 +; GFX11-NEXT: s_sub_i32 s10, s6, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 ; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 ; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 +; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], s7 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7152,40 +7226,41 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX6-NEXT: s_sub_i32 s17, s18, 64 -; GFX6-NEXT: s_sub_i32 s19, 64, s18 -; GFX6-NEXT: s_cmp_lt_u32 s18, 64 -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, 0 +; GFX6-NEXT: s_and_b32 s17, s16, 0x7f +; GFX6-NEXT: s_sub_i32 s19, s17, 64 +; GFX6-NEXT: s_sub_i32 s21, 64, s17 +; GFX6-NEXT: s_cmp_lt_u32 s17, 64 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX6-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX6-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX6-NEXT: s_cmp_lg_u32 s23, 0 -; GFX6-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s17, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] 
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_mov_b32 s22, 0 +; GFX6-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_mov_b32 s18, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX6-NEXT: s_lshl_b32 s23, s10, 31 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX6-NEXT: s_lshl_b32 s19, s10, 31 ; GFX6-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX6-NEXT: s_sub_i32 s23, s16, 64 -; GFX6-NEXT: s_sub_i32 s18, 64, s16 -; GFX6-NEXT: s_cmp_lt_u32 s16, 64 +; GFX6-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX6-NEXT: s_not_b32 s17, s16 +; GFX6-NEXT: s_sub_i32 s19, s10, 64 +; GFX6-NEXT: s_sub_i32 s21, 64, s10 +; GFX6-NEXT: s_cmp_lt_u32 s10, 64 ; GFX6-NEXT: s_cselect_b32 s26, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 ; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 @@ -7193,86 +7268,88 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX6-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX6-NEXT: s_sub_i32 s11, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 +; GFX6-NEXT: s_and_b32 s8, s20, 0x7f +; GFX6-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX6-NEXT: s_sub_i32 s19, s8, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s21, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s22, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX6-NEXT: s_cmp_lg_u32 s22, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX6-NEXT: s_lshl_b32 s23, s14, 31 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX6-NEXT: s_sub_i32 s18, s10, 64 -; GFX6-NEXT: s_sub_i32 s14, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_lshl_b32 
s19, s14, 31 +; GFX6-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX6-NEXT: s_not_b32 s14, s20 +; GFX6-NEXT: s_sub_i32 s18, s12, 64 +; GFX6-NEXT: s_sub_i32 s16, 64, s12 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX6-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX6-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX8-NEXT: s_sub_i32 s17, s18, 64 -; GFX8-NEXT: s_sub_i32 s19, 64, s18 -; GFX8-NEXT: s_cmp_lt_u32 s18, 64 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s18, 0 +; GFX8-NEXT: s_and_b32 s17, s16, 0x7f +; GFX8-NEXT: s_sub_i32 s19, s17, 64 +; GFX8-NEXT: s_sub_i32 s21, 64, s17 +; GFX8-NEXT: s_cmp_lt_u32 s17, 64 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX8-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX8-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s17, 0 +; GFX8-NEXT: s_cselect_b32 s17, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_mov_b32 s22, 0 +; GFX8-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_mov_b32 s18, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX8-NEXT: s_lshl_b32 s23, s10, 31 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX8-NEXT: s_lshl_b32 s19, s10, 31 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX8-NEXT: s_sub_i32 s23, s16, 64 -; GFX8-NEXT: s_sub_i32 s18, 64, s16 -; GFX8-NEXT: s_cmp_lt_u32 s16, 64 +; GFX8-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX8-NEXT: s_not_b32 s17, s16 +; GFX8-NEXT: s_sub_i32 s19, s10, 64 +; GFX8-NEXT: s_sub_i32 s21, 64, 
s10 +; GFX8-NEXT: s_cmp_lt_u32 s10, 64 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 @@ -7280,86 +7357,88 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX8-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX8-NEXT: s_sub_i32 s11, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 +; GFX8-NEXT: s_and_b32 s8, s20, 0x7f +; GFX8-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX8-NEXT: s_sub_i32 s19, s8, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s22, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX8-NEXT: s_cmp_lg_u32 s22, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX8-NEXT: s_lshl_b32 s23, s14, 31 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX8-NEXT: s_sub_i32 s18, s10, 64 -; GFX8-NEXT: s_sub_i32 s14, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_lshl_b32 s19, s14, 31 +; GFX8-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX8-NEXT: s_not_b32 s14, s20 +; GFX8-NEXT: s_sub_i32 s18, s12, 64 +; GFX8-NEXT: s_sub_i32 s16, 64, s12 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX8-NEXT: 
s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX8-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX8-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX9-NEXT: s_sub_i32 s17, s18, 64 -; GFX9-NEXT: s_sub_i32 s19, 64, s18 -; GFX9-NEXT: s_cmp_lt_u32 s18, 64 -; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s18, 0 +; GFX9-NEXT: s_and_b32 s17, s16, 0x7f +; GFX9-NEXT: s_sub_i32 s19, s17, 64 +; GFX9-NEXT: s_sub_i32 s21, 64, s17 +; GFX9-NEXT: s_cmp_lt_u32 s17, 64 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX9-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX9-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 -; GFX9-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s17, 0 +; GFX9-NEXT: s_cselect_b32 s17, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 -; GFX9-NEXT: s_mov_b32 s22, 0 +; GFX9-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_mov_b32 s18, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX9-NEXT: s_lshl_b32 s23, s10, 31 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX9-NEXT: s_lshl_b32 s19, s10, 31 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX9-NEXT: s_sub_i32 s23, s16, 64 -; GFX9-NEXT: s_sub_i32 s18, 64, s16 -; GFX9-NEXT: s_cmp_lt_u32 s16, 64 +; GFX9-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX9-NEXT: s_not_b32 s17, s16 +; GFX9-NEXT: s_sub_i32 s19, s10, 64 +; GFX9-NEXT: s_sub_i32 s21, 64, s10 +; GFX9-NEXT: s_cmp_lt_u32 s10, 64 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 @@ -7367,222 +7446,227 @@ define amdgpu_ps <2 x i128> 
@s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX9-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX9-NEXT: s_sub_i32 s11, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 +; GFX9-NEXT: s_and_b32 s8, s20, 0x7f +; GFX9-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX9-NEXT: s_sub_i32 s19, s8, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX9-NEXT: s_lshl_b32 s23, s14, 31 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX9-NEXT: s_sub_i32 s18, s10, 64 -; GFX9-NEXT: s_sub_i32 s14, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 +; GFX9-NEXT: s_lshl_b32 s19, s14, 31 +; GFX9-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX9-NEXT: s_not_b32 s14, s20 +; GFX9-NEXT: s_sub_i32 s18, s12, 64 +; GFX9-NEXT: s_sub_i32 s16, 64, s12 +; GFX9-NEXT: s_cmp_lt_u32 s12, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX9-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX9-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; 
GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX10-NEXT: s_sub_i32 s17, s18, 64 -; GFX10-NEXT: s_sub_i32 s19, 64, s18 -; GFX10-NEXT: s_cmp_lt_u32 s18, 64 -; GFX10-NEXT: s_mov_b32 s22, 0 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s18, 0 +; GFX10-NEXT: s_and_b32 s17, s16, 0x7f +; GFX10-NEXT: s_mov_b32 s18, 0 +; GFX10-NEXT: s_sub_i32 s19, s17, 64 +; GFX10-NEXT: s_sub_i32 s21, 64, s17 +; GFX10-NEXT: s_cmp_lt_u32 s17, 64 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s19 -; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s18 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[0:1], s18 -; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX10-NEXT: s_cmp_lg_u32 s23, 0 -; GFX10-NEXT: s_cselect_b64 s[18:19], s[18:19], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s17, 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s21 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s16 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s16 +; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX10-NEXT: s_lshl_b32 s23, s10, 31 +; GFX10-NEXT: s_lshl_b32 s19, s10, 31 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] -; GFX10-NEXT: s_sub_i32 s23, s16, 64 -; GFX10-NEXT: s_sub_i32 s17, 64, s16 -; GFX10-NEXT: s_cmp_lt_u32 s16, 64 +; GFX10-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX10-NEXT: s_not_b32 s19, s16 +; GFX10-NEXT: s_sub_i32 s21, s10, 64 +; GFX10-NEXT: s_sub_i32 s16, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s16, 0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 ; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s19 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[8:9], s16 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[8:9], s19 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s21 ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 -; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX10-NEXT: s_and_b32 s10, s20, 0x7f +; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX10-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1] -; GFX10-NEXT: s_sub_i32 s11, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_sub_i32 s19, s10, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_lshr_b64 
s[16:17], s[4:5], s9 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s20 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s20 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s22, 0 ; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX10-NEXT: s_lshl_b32 s23, s14, 31 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX10-NEXT: s_sub_i32 s18, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: s_lshl_b32 s19, s14, 31 +; GFX10-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX10-NEXT: s_not_b32 s16, s20 +; GFX10-NEXT: s_sub_i32 s18, s12, 64 +; GFX10-NEXT: s_sub_i32 s14, 64, s12 +; GFX10-NEXT: s_cmp_lt_u32 s12, 64 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 -; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[4:5], s16 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17] -; GFX11-NEXT: s_sub_i32 s17, s18, 64 -; GFX11-NEXT: s_sub_i32 s19, 64, s18 -; GFX11-NEXT: s_cmp_lt_u32 s18, 64 -; GFX11-NEXT: s_mov_b32 s22, 0 -; GFX11-NEXT: s_cselect_b32 s23, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s18, 0 +; GFX11-NEXT: s_and_b32 s17, s16, 0x7f +; GFX11-NEXT: s_mov_b32 s18, 0 +; GFX11-NEXT: s_sub_i32 s19, s17, 64 +; GFX11-NEXT: s_sub_i32 s21, 64, s17 +; GFX11-NEXT: s_cmp_lt_u32 s17, 64 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s19 -; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s18 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[0:1], s18 -; GFX11-NEXT: 
s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX11-NEXT: s_cmp_lg_u32 s23, 0 -; GFX11-NEXT: s_cselect_b64 s[18:19], s[18:19], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s17, 0 +; GFX11-NEXT: s_cselect_b32 s17, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s21 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s16 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s16 +; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX11-NEXT: s_lshl_b32 s23, s10, 31 +; GFX11-NEXT: s_lshl_b32 s19, s10, 31 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] -; GFX11-NEXT: s_sub_i32 s23, s16, 64 -; GFX11-NEXT: s_sub_i32 s17, 64, s16 -; GFX11-NEXT: s_cmp_lt_u32 s16, 64 +; GFX11-NEXT: s_and_not1_b32 s10, 0x7f, s16 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX11-NEXT: s_not_b32 s19, s16 +; GFX11-NEXT: s_sub_i32 s21, s10, 64 +; GFX11-NEXT: s_sub_i32 s16, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 ; GFX11-NEXT: s_cselect_b32 s26, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s16, 0 +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 ; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s19 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[8:9], s16 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[8:9], s19 +; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s21 ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s27, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX11-NEXT: s_and_b32 s10, s20, 0x7f +; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX11-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1] -; GFX11-NEXT: s_sub_i32 s11, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_sub_i32 s19, s10, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 ; GFX11-NEXT: s_cselect_b32 s21, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s20 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s20 +; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_lshl_b64 
s[4:5], s[4:5], s19 ; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 ; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX11-NEXT: s_lshl_b32 s23, s14, 31 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX11-NEXT: s_sub_i32 s18, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 +; GFX11-NEXT: s_lshl_b32 s19, s14, 31 +; GFX11-NEXT: s_and_not1_b32 s12, 0x7f, s20 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX11-NEXT: s_not_b32 s16, s20 +; GFX11-NEXT: s_sub_i32 s18, s12, 64 +; GFX11-NEXT: s_sub_i32 s14, 64, s12 +; GFX11-NEXT: s_cmp_lt_u32 s12, 64 ; GFX11-NEXT: s_cselect_b32 s19, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cmp_eq_u32 s12, 0 ; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 -; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[4:5], s16 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result @@ -7592,56 +7676,54 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v16 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v23 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 -; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v23 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v23 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19 +; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 +; GFX6-NEXT: v_lshl_b64 v[23:24], v[0:1], v19 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v25 -; GFX6-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX6-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX6-NEXT: 
v_cndmask_b32_e32 v21, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX6-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX6-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX6-NEXT: v_not_b32_e32 v8, v16 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v24 -; GFX6-NEXT: v_subrev_i32_e32 v23, vcc, 64, v24 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v24 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v23 +; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v23 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v24 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v23 +; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v23 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v24 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18 -; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v18 +; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v20 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v19 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7651,88 +7733,88 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX6-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v19 -; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v19 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v19 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v10, 
vcc, 64, v14 +; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 -; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v19 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v14 +; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v14 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v15 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX6-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 ; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v16, v16 -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v23 -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19 +; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX8-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX8-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX8-NEXT: v_not_b32_e32 v8, v16 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v24 -; GFX8-NEXT: v_subrev_u32_e32 v23, vcc, 64, v24 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v23 +; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX8-NEXT: 
v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18 -; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v18 +; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7742,87 +7824,87 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v19 -; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v19 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5] +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v14 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX8-NEXT: v_or_b32_e32 v1, 
v22, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX8-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_not_b32_e32 v16, v16 -; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v16, 64, v23 -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v23 -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19 +; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX9-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v1, v17, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc +; GFX9-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX9-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v18, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v22, v16, v3, vcc +; GFX9-NEXT: v_not_b32_e32 v8, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v17, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v8 ; GFX9-NEXT: v_lshl_or_b32 v1, v10, 31, v1 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 -; GFX9-NEXT: v_subrev_u32_e32 v23, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v23 +; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v2, v18, v2 ; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_not_b32_e32 v8, v20 -; GFX9-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; 
GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 -; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v18 +; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7833,89 +7915,91 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v20, v8, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v19, v8, v7, vcc +; GFX9-NEXT: v_not_b32_e32 v8, v20 ; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v19 -; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v19 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v14 +; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX9-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 ; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX10-NEXT: v_not_b32_e32 v16, v16 +; GFX10-NEXT: v_not_b32_e32 v21, v16 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27 -; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16 +; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v21 ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] ; 
GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28 -; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v0, v24, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX10-NEXT: v_or_b32_e32 v24, v24, v26 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v27 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s4 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v17, v24, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v23, v19, v3, s4 ; GFX10-NEXT: v_and_b32_e32 v24, 0x7f, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v23, v19, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v17, v9, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 +; GFX10-NEXT: v_not_b32_e32 v16, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX10-NEXT: v_not_b32_e32 v3, v20 ; GFX10-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v24 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v24 +; GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v16 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v3 ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7] -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] ; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v22 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 @@ -7953,88 +8037,87 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX11-NEXT: v_not_b32_e32 v16, v16 +; GFX11-NEXT: v_not_b32_e32 v21, v16 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 -; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v16 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v21 +; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v21, 0, v21 :: v_dual_cndmask_b32 v22, 0, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] -; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX11-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v28 -; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX11-NEXT: v_or_b32_e32 v18, v16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v19, v1, v19 :: v_dual_cndmask_b32 v18, v0, v18 +; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 -; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v0, v24, v26 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v26 +; GFX11-NEXT: v_dual_cndmask_b32 v25, 0, v1 :: v_dual_cndmask_b32 v16, v16, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v10, v17, v24, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v23, v19, v3, s0 ; GFX11-NEXT: v_and_b32_e32 v24, 0x7f, v20 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v23, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s0 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v23, v19, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v16, v8, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v17, v9, s1 -; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v24 -; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v9, s1 +; GFX11-NEXT: v_not_b32_e32 v16, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo +; GFX11-NEXT: 
v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX11-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-NEXT: v_not_b32_e32 v3, v20 ; GFX11-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v16 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5] ; GFX11-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7] ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 -; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] ; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v22 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 +; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 ; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 ; GFX11-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo ; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v22 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v16, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v18, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v10, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v9, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v4, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 8538dcabca924..58304d2072d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -347,49 +347,57 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX8-LABEL: s_fshr_i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_andn2_b32 s3, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s3, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 
s2 -; GFX8-NEXT: s_lshr_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_andn2_b32 s3, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s3, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s3, 7, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i8: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s3, 7, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_and_b32 s3, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -414,33 +422,33 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX8-LABEL: v_fshr_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v3, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 @@ -451,9 +459,9 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX11-LABEL: v_fshr_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v3, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 @@ -687,25 +695,29 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX8-LABEL: s_fshr_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 -; GFX8-NEXT: s_and_b32 s6, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_andn2_b32 s6, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s1, s1, s6 -; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s5, 7 +; GFX8-NEXT: s_lshl_b32 s1, s3, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s5, 7 +; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s1, s3, s1 -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 @@ -714,25 +726,29 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX9-LABEL: s_fshr_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 -; GFX9-NEXT: s_and_b32 s6, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_andn2_b32 s6, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s1, s1, s6 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s5, 7 +; 
GFX9-NEXT: s_lshl_b32 s1, s3, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s5, 7 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_lshr_b32 s1, s3, s1 -; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 @@ -741,24 +757,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX10-LABEL: s_fshr_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_andn2_b32 s5, 7, s2 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s5, s2, 8 -; GFX10-NEXT: s_and_b32 s6, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s4, s1, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshr_b32 s6, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_andn2_b32 s5, 7, s6 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s6, s6, 7 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s5, 7 -; GFX10-NEXT: s_andn2_b32 s5, 7, s5 +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 -; GFX10-NEXT: s_lshr_b32 s2, s4, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_and_b32 s0, s0, 0xff @@ -768,24 +788,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX11-LABEL: s_fshr_v2i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s2 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-NEXT: s_and_b32 s6, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshr_b32 s6, s2, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, s5 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s6 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s6, s6, 7 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s2, s5, 7 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s3, s3, s5 -; GFX11-NEXT: s_lshr_b32 s2, s4, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xff @@ -832,23 +856,23 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX8-LABEL: v_fshr_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_not_b32_e32 v2, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v5 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -857,23 +881,23 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX9-LABEL: v_fshr_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_not_b32_e32 v2, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v5 +; GFX9-NEXT: v_lshrrev_b16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -885,20 +909,20 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-NEXT: 
v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: v_not_b32_e32 v7, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 -; GFX10-NEXT: v_lshlrev_b16 v4, v7, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v6, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0 ; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -912,22 +936,22 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b16 v3, v3, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v4, v6, v4 -; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: v_lshlrev_b16 v0, v7, v0 ; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 @@ -997,50 +1021,58 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX8-LABEL: s_fshr_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s0, 24 ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 -; GFX8-NEXT: s_and_b32 s12, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_andn2_b32 s12, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; 
GFX8-NEXT: s_lshl_b32 s0, s0, s12 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s1, s1, s12 -; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s9, 7 +; GFX8-NEXT: s_lshl_b32 s1, s3, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s9, 7 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s1, s3, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 -; GFX8-NEXT: s_lshl_b32 s4, s4, 1 -; GFX8-NEXT: s_lshl_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s2, s4, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s3, s10, 7 ; GFX8-NEXT: s_and_b32 s4, s7, 0xff -; GFX8-NEXT: s_or_b32 s1, s2, s1 -; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_lshr_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s11, 7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshr_b32 s3, s4, s3 ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 -; GFX8-NEXT: s_lshl_b32 s5, s5, 1 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s3, s5, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s4, s11, 7 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshl_b32 s4, s5, s4 -; GFX8-NEXT: s_lshr_b32 s3, s8, s3 +; GFX8-NEXT: s_lshr_b32 s4, s8, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff @@ -1050,50 +1082,58 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX9-LABEL: s_fshr_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 -; GFX9-NEXT: s_and_b32 s12, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_andn2_b32 s12, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s12 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s1, s1, s12 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s9, 7 +; GFX9-NEXT: s_lshl_b32 s1, s3, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s9, 7 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 
-; GFX9-NEXT: s_lshr_b32 s1, s3, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 -; GFX9-NEXT: s_lshl_b32 s4, s4, 1 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s2, s4, 1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s3, s10, 7 ; GFX9-NEXT: s_and_b32 s4, s7, 0xff -; GFX9-NEXT: s_or_b32 s1, s2, s1 -; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX9-NEXT: s_lshr_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_or_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s11, 7 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_lshr_b32 s3, s4, s3 ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 -; GFX9-NEXT: s_lshl_b32 s5, s5, 1 +; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s3, s5, 1 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_and_b32 s4, s11, 7 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshl_b32 s4, s5, s4 -; GFX9-NEXT: s_lshr_b32 s3, s8, s3 +; GFX9-NEXT: s_lshr_b32 s4, s8, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff -; GFX9-NEXT: s_or_b32 s3, s4, s3 +; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s3, 0xff @@ -1104,43 +1144,51 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-LABEL: s_fshr_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_andn2_b32 s12, 7, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s9, 7 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_andn2_b32 s2, 7, s9 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_and_b32 s9, s9, 7 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_lshr_b32 s1, s1, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s9 -; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX10-NEXT: s_lshl_b32 s0, s0, s12 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s3, s6, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s10, 7 -; GFX10-NEXT: s_andn2_b32 s3, 7, s10 -; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: s_or_b32 s1, s2, s3 +; GFX10-NEXT: s_andn2_b32 s2, 7, s10 +; GFX10-NEXT: s_lshl_b32 s3, s4, 1 +; GFX10-NEXT: s_and_b32 s4, s7, 0xff +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: 
s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_andn2_b32 s4, 7, s11 -; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s3, s4, s6 +; GFX10-NEXT: s_lshl_b32 s4, s5, 1 +; GFX10-NEXT: s_andn2_b32 s5, 7, s11 ; GFX10-NEXT: s_and_b32 s6, s11, 7 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s5, s8, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_or_b32 s2, s2, s3 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff @@ -1157,43 +1205,51 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-LABEL: s_fshr_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s0, 24 ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_lshr_b32 s9, s2, 8 ; GFX11-NEXT: s_lshr_b32 s10, s2, 16 ; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s2, s9, 7 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s9 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s9, s9, 7 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_lshr_b32 s1, s1, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s9 -; GFX11-NEXT: s_lshr_b32 s2, s6, s2 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s0, s0, s12 +; GFX11-NEXT: s_lshl_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s3, s6, s9 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s10, 7 -; GFX11-NEXT: s_and_not1_b32 s3, 7, s10 -; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s10 +; GFX11-NEXT: s_lshl_b32 s3, s4, 1 +; GFX11-NEXT: s_and_b32 s4, s7, 0xff +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s6, s2 -; GFX11-NEXT: s_and_not1_b32 s4, 7, s11 -; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_lshl_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s3, s4, s6 +; GFX11-NEXT: s_lshl_b32 s4, s5, 1 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s11 ; GFX11-NEXT: s_and_b32 s6, s11, 7 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_lshl_b32 s4, s4, s5 ; GFX11-NEXT: s_lshr_b32 s5, s8, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: 
s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_or_b32 s3, s4, s5 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff @@ -1272,40 +1328,41 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshr_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v7, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v7, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8 -; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v7, -1 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 +; GFX8-NEXT: v_xor_b32_sdwa v9, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX8-NEXT: v_mov_b32_e32 v8, 0xff -; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v9, 1 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, 7 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_xor_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v10 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v8, v10, v8 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1321,40 +1378,41 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshr_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_not_b32_e32 v7, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, 1, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, v7, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 -; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v5, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v7, -1 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 7 -; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_xor_b32_sdwa v9, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX9-NEXT: v_mov_b32_e32 v8, 0xff -; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v10 -; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v10 -; GFX9-NEXT: v_lshlrev_b16_e32 
v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v9, 7 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v10, v10, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v10 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1371,45 +1429,46 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX10-NEXT: v_not_b32_e32 v8, v2 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, -1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v10, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_mov_b32_e32 v3, 7 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX10-NEXT: v_mov_b32_e32 v14, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_not_b32_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_not_b32_sdwa v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX10-NEXT: v_mov_b32_e32 v10, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX10-NEXT: v_xor_b32_sdwa v9, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v10, 7 +; GFX10-NEXT: v_xor_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v12, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v13, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v10, 7, v14 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7 -; GFX10-NEXT: 
v_and_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b16 v3, v5, v9 -; GFX10-NEXT: v_lshlrev_b16 v5, v8, v6 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v5, v5, v8 +; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 ; GFX10-NEXT: v_lshrrev_b16 v1, v15, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX10-NEXT: v_lshlrev_b16 v3, v3, v7 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v11 ; GFX10-NEXT: v_lshrrev_b16 v7, v12, v13 -; GFX10-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, 8 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 @@ -1427,29 +1486,29 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_not_b32_e32 v12, v7 +; GFX11-NEXT: v_xor_b32_e32 v12, -1, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX11-NEXT: v_not_b32_e32 v14, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX11-NEXT: v_xor_b32_e32 v14, -1, v11 ; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 -; GFX11-NEXT: v_not_b32_e32 v7, v13 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX11-NEXT: v_not_b32_e32 v10, v2 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3 -; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 +; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-NEXT: v_lshlrev_b16 v4, v12, v4 @@ -5112,51 +5171,46 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) define amdgpu_ps i64 @s_fshr_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) { ; GFX6-LABEL: s_fshr_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX6-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX6-NEXT: s_not_b32 s5, s4 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_or_b64 s[0:1], 
s[0:1], s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX8-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX8-NEXT: s_not_b32 s5, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX9-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX9-NEXT: s_not_b32 s5, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_andn2_b64 s[6:7], 63, s[4:5] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], 63 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX10-NEXT: s_not_b32 s5, s4 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_not1_b64 s[6:7], 63, s[4:5] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_and_b64 s[4:5], s[4:5], 63 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX11-NEXT: s_not_b32 s5, s4 ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: ; return to shader part epilog @@ -5233,12 +5287,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6-LABEL: v_fshr_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: v_not_b32_e32 v5, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v5 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5246,12 +5300,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8-LABEL: v_fshr_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: v_not_b32_e32 v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5259,12 +5313,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9-LABEL: v_fshr_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: 
v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5410,38 +5464,38 @@ define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) { define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) { ; GFX6-LABEL: v_fshr_i64_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v0 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v0 -; GFX6-NEXT: v_lshr_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 +; GFX6-NEXT: v_lshr_b64 v[3:4], s[2:3], v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i64_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] +; GFX8-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i64_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] +; GFX9-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i64_ssv: @@ -5478,43 +5532,43 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg %amt) { ; GFX6-LABEL: v_fshr_i64_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX6-NEXT: s_not_b32 s3, s2 +; GFX6-NEXT: s_and_b32 s2, s2, 63 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s4 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i64_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX8-NEXT: s_not_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 63 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i64_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX9-NEXT: s_not_b32 s3, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 63 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i64_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: s_and_b32 s3, s2, 63 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5522,10 +5576,10 @@ define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; ; GFX11-LABEL: v_fshr_i64_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_and_b32 s3, s2, 63 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX11-NEXT: s_not_b32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -5542,10 +5596,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX6-LABEL: v_fshr_i64_vss: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s2 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_andn2_b32 s3, 63, s2 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s3 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -5553,10 +5606,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX8-LABEL: v_fshr_i64_vss: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1] -; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_andn2_b32 s3, 63, s2 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -5564,10 +5616,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX9-LABEL: v_fshr_i64_vss: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; 
GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1] -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_andn2_b32 s3, 63, s2 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -5575,10 +5626,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX10-LABEL: v_fshr_i64_vss: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3] -; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63 +; GFX10-NEXT: s_andn2_b32 s3, 63, s2 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -5586,13 +5636,12 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX11-LABEL: v_fshr_i64_vss: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3] -; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_not1_b32 s3, 63, s2 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5603,63 +5652,55 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX6-NEXT: s_not_b32 s9, s8 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX6-NEXT: s_not_b32 s4, s10 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX8-NEXT: s_not_b32 s9, s8 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX8-NEXT: s_andn2_b64 
s[8:9], 63, s[10:11] ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX8-NEXT: s_not_b32 s4, s10 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX9-NEXT: s_not_b32 s9, s8 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX9-NEXT: s_not_b32 s4, s10 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_andn2_b64 s[12:13], 63, s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[8:9], 63 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] +; GFX10-NEXT: s_not_b32 s9, s8 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_and_b64 s[10:11], s[10:11], 63 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX10-NEXT: s_not_b32 s9, s10 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s9 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] @@ -5667,15 +5708,13 @@ define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg % ; ; GFX11-LABEL: s_fshr_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_not1_b64 s[12:13], 63, s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[8:9], 63 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[10:11] +; GFX11-NEXT: s_not_b32 s9, s8 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_and_b64 s[10:11], s[10:11], 63 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX11-NEXT: s_not_b32 s9, s10 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s9 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] @@ -5688,18 +5727,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6-LABEL: v_fshr_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: v_not_b32_e32 v9, v8 +; GFX6-NEXT: v_and_b32_e32 v9, 63, v9 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v9 -; GFX6-NEXT: v_not_b32_e32 v8, v10 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 +; GFX6-NEXT: v_lshr_b64 
v[4:5], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v10 +; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v8 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5709,18 +5748,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8-LABEL: v_fshr_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: v_not_b32_e32 v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 63, v9 ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX8-NEXT: v_not_b32_e32 v8, v10 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v10 +; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5730,18 +5769,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9-LABEL: v_fshr_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_not_b32_e32 v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 63, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v8, v10 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v10 +; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5800,231 +5839,237 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: s_fshr_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: s_sub_i32 s11, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX6-NEXT: s_not_b32 s9, s8 +; GFX6-NEXT: s_sub_i32 s16, s2, 64 +; GFX6-NEXT: s_sub_i32 s12, 
64, s2 +; GFX6-NEXT: s_cmp_lt_u32 s2, 64 ; GFX6-NEXT: s_cselect_b32 s17, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 -; GFX6-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] +; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s18, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s9 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_sub_i32 s14, s10, 64 -; GFX6-NEXT: s_sub_i32 s12, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] +; GFX6-NEXT: s_and_b32 s0, s8, 0x7f +; GFX6-NEXT: s_sub_i32 s14, s0, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s15, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX6-NEXT: s_cmp_lg_u32 s15, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX6-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_sub_i32 s11, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX8-NEXT: s_not_b32 s9, s8 +; GFX8-NEXT: s_sub_i32 s16, s2, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s2 +; GFX8-NEXT: s_cmp_lt_u32 s2, 64 ; GFX8-NEXT: s_cselect_b32 s17, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 -; GFX8-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 -; GFX8-NEXT: s_cmp_lg_u32 s16, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 +; GFX8-NEXT: 
s_lshl_b64 s[14:15], s[0:1], s9 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX8-NEXT: s_cmp_lg_u32 s17, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX8-NEXT: s_sub_i32 s14, s10, 64 -; GFX8-NEXT: s_sub_i32 s12, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] +; GFX8-NEXT: s_cmp_lg_u32 s18, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] +; GFX8-NEXT: s_and_b32 s0, s8, 0x7f +; GFX8-NEXT: s_sub_i32 s14, s0, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s15, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 -; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX8-NEXT: s_cmp_lg_u32 s15, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX8-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_sub_i32 s11, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX9-NEXT: s_not_b32 s9, s8 +; GFX9-NEXT: s_sub_i32 s16, s2, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s2 +; GFX9-NEXT: s_cmp_lt_u32 s2, 64 ; GFX9-NEXT: s_cselect_b32 s17, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s18, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 +; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s9 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX9-NEXT: s_cmp_lg_u32 s17, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX9-NEXT: s_sub_i32 s14, s10, 64 -; GFX9-NEXT: s_sub_i32 s12, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] +; GFX9-NEXT: 
s_and_b32 s0, s8, 0x7f +; GFX9-NEXT: s_sub_i32 s14, s0, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s15, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 -; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX9-NEXT: s_cmp_lg_u32 s15, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s12, s1, 31 -; GFX10-NEXT: s_mov_b32 s13, 0 +; GFX10-NEXT: s_lshr_b32 s10, s1, 31 +; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] -; GFX10-NEXT: s_sub_i32 s11, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; GFX10-NEXT: s_not_b32 s14, s8 +; GFX10-NEXT: s_sub_i32 s16, s9, 64 +; GFX10-NEXT: s_sub_i32 s10, 64, s9 +; GFX10-NEXT: s_cmp_lt_u32 s9, 64 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s9 -; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 -; GFX10-NEXT: s_cmp_lg_u32 s16, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s9, 0 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[2:3], s14 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[0:1], s14 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s16 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s14, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: s_and_b32 s0, s8, 0x7f +; GFX10-NEXT: s_sub_i32 s14, s0, 64 +; GFX10-NEXT: s_sub_i32 s9, 64, s0 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s10 -; GFX10-NEXT: s_lshl_b64 s[12:13], s[6:7], s11 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[6:7], s10 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s8 +; 
GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s9 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s12, s1, 31 -; GFX11-NEXT: s_mov_b32 s13, 0 +; GFX11-NEXT: s_lshr_b32 s10, s1, 31 +; GFX11-NEXT: s_mov_b32 s11, 0 +; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] -; GFX11-NEXT: s_sub_i32 s11, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; GFX11-NEXT: s_not_b32 s14, s8 +; GFX11-NEXT: s_sub_i32 s16, s9, 64 +; GFX11-NEXT: s_sub_i32 s10, 64, s9 +; GFX11-NEXT: s_cmp_lt_u32 s9, 64 ; GFX11-NEXT: s_cselect_b32 s17, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s9 -; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 -; GFX11-NEXT: s_cmp_lg_u32 s16, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s9, 0 +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 +; GFX11-NEXT: s_lshl_b64 s[12:13], s[2:3], s14 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[0:1], s14 +; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s16 ; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s14, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 +; GFX11-NEXT: s_and_b32 s0, s8, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s14, s0, 64 +; GFX11-NEXT: s_sub_i32 s9, 64, s0 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 ; GFX11-NEXT: s_cselect_b32 s15, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s10 -; GFX11-NEXT: s_lshl_b64 s[12:13], s[6:7], s11 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[6:7], s10 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s9 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX11-NEXT: s_cmp_lg_u32 s15, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: 
s_cselect_b64 s[4:5], s[10:11], 0 -; GFX11-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -6035,29 +6080,29 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-LABEL: v_fshr_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 +; GFX6-NEXT: v_lshl_b64 v[9:10], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX6-NEXT: v_not_b32_e32 v0, v8 +; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v0 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v15 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0 +; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 -; GFX6-NEXT: v_lshl_b64 v[12:13], v[8:9], v15 -; GFX6-NEXT: v_or_b32_e32 v10, v0, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v1, v11 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[8:9], v16 +; GFX6-NEXT: v_lshl_b64 v[13:14], v[9:10], v15 +; GFX6-NEXT: v_or_b32_e32 v11, v0, v11 +; GFX6-NEXT: v_or_b32_e32 v12, v1, v12 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[9:10], v16 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 -; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX6-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v14 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 @@ -6074,38 +6119,38 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v13, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v8 +; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 
v[10:11], v15, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] +; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] ; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 -; GFX8-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9] -; GFX8-NEXT: v_or_b32_e32 v10, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v1, v11 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10] +; GFX8-NEXT: v_or_b32_e32 v11, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v12, v1, v12 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v14 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] @@ -6122,39 +6167,39 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v13, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v8 +; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] ; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9] -; GFX9-NEXT: v_or_b32_e32 v10, v0, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v1, v11 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10] +; GFX9-NEXT: v_or_b32_e32 v11, v0, v11 +; GFX9-NEXT: v_or_b32_e32 v12, v1, v12 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GFX9-NEXT: 
v_cndmask_b32_e32 v10, v0, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 64, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] ; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 @@ -6170,10 +6215,10 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX9-NEXT: v_or_b32_e32 v1, v13, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX9-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_i128: @@ -6282,158 +6327,158 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshr_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s9, 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v0 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s8, s1, 31 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v7 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[10:11], v0 -; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v7 +; GFX6-NEXT: s_lshr_b32 s0, s1, 31 +; GFX6-NEXT: s_mov_b32 s1, 0 +; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1 +; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 -; GFX6-NEXT: v_lshl_b64 v[4:5], s[10:11], v7 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[10:11], v8 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[8:9], v8 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: v_mov_b32_e32 v4, s1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v6 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v6 +; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v10 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[6:7], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v6 +; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], 
s[6:7], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v6 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: s_mov_b32 s9, 0 -; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s8, s1, 31 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v7 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1] +; GFX8-NEXT: s_lshr_b32 s0, s1, 31 +; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, s[10:11] -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11] +; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v6 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v6 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; 
GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s9, 0 -; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshr_b32 s8, s1, 31 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v7 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s1, 31 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] ; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, s[10:11] -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11] +; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v6 +; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX9-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i128_ssv: @@ -6543,40 +6588,41 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshr_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: s_sub_i32 s7, s4, 64 -; GFX6-NEXT: s_sub_i32 s5, 64, s4 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX6-NEXT: s_not_b32 s5, s4 +; GFX6-NEXT: s_sub_i32 s12, s2, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s2 +; GFX6-NEXT: s_cmp_lt_u32 s2, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX6-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s14, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX6-NEXT: s_sub_i32 s0, s6, 64 -; GFX6-NEXT: s_sub_i32 s1, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 -; GFX6-NEXT: s_cselect_b32 s7, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s6 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX6-NEXT: s_and_b32 s0, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s1, s0, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 ; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s6 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 -; GFX6-NEXT: s_and_b32 s0, 1, s7 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1 +; GFX6-NEXT: s_and_b32 s0, 1, s5 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6590,46 +6636,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX6-NEXT: v_or_b32_e32 v0, s2, v0 
; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX6-NEXT: v_or_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_or_b32_e32 v3, s5, v3 +; GFX6-NEXT: v_or_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_sub_i32 s7, s4, 64 -; GFX8-NEXT: s_sub_i32 s5, 64, s4 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX8-NEXT: s_not_b32 s5, s4 +; GFX8-NEXT: s_sub_i32 s12, s2, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s2 +; GFX8-NEXT: s_cmp_lt_u32 s2, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX8-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s14, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX8-NEXT: s_sub_i32 s0, s6, 64 -; GFX8-NEXT: s_sub_i32 s1, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 -; GFX8-NEXT: s_cselect_b32 s7, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX8-NEXT: s_cmp_lg_u32 s14, 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: s_and_b32 s0, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s1, s0, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] -; GFX8-NEXT: s_and_b32 s0, 1, s7 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s5 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6643,46 +6690,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX8-NEXT: v_or_b32_e32 v2, s4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_or_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; 
GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_sub_i32 s7, s4, 64 -; GFX9-NEXT: s_sub_i32 s5, 64, s4 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX9-NEXT: s_not_b32 s5, s4 +; GFX9-NEXT: s_sub_i32 s12, s2, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s2 +; GFX9-NEXT: s_cmp_lt_u32 s2, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX9-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s14, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX9-NEXT: s_sub_i32 s0, s6, 64 -; GFX9-NEXT: s_sub_i32 s1, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 -; GFX9-NEXT: s_cselect_b32 s7, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX9-NEXT: s_cmp_lg_u32 s14, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX9-NEXT: s_and_b32 s0, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s1, s0, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] -; GFX9-NEXT: s_and_b32 s0, 1, s7 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s5 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6696,50 +6744,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX9-NEXT: v_or_b32_e32 v2, s4, v2 -; GFX9-NEXT: v_or_b32_e32 v3, s5, v3 +; GFX9-NEXT: v_or_b32_e32 v2, s6, v2 +; GFX9-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: s_lshr_b32 s6, s1, 31 +; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_sub_i32 s7, s4, 64 -; GFX10-NEXT: s_sub_i32 s5, 64, s4 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; 
GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX10-NEXT: s_not_b32 s10, s4 +; GFX10-NEXT: s_sub_i32 s12, s5, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s10 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s0, 64, s6 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX10-NEXT: s_sub_i32 s0, s6, 64 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_and_b32 s0, s4, 0x7f +; GFX10-NEXT: s_sub_i32 s1, 64, s0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX10-NEXT: s_sub_i32 s1, s0, 64 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX10-NEXT: s_cselect_b32 s7, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s1 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s7 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX10-NEXT: s_and_b32 s0, 1, s5 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo @@ -6749,64 +6798,65 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX10-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i128_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s8, s1, 31 -; GFX11-NEXT: s_mov_b32 s9, 0 +; GFX11-NEXT: s_lshr_b32 s6, s1, 31 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_sub_i32 s7, s4, 64 -; GFX11-NEXT: s_sub_i32 s5, 64, s4 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX11-NEXT: 
s_not_b32 s10, s4 +; GFX11-NEXT: s_sub_i32 s12, s5, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s10 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s5, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s0, 64, s6 +; GFX11-NEXT: s_and_b32 s0, s4, 0x7f ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX11-NEXT: s_sub_i32 s0, s6, 64 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_sub_i32 s1, 64, s0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX11-NEXT: s_sub_i32 s1, s0, 64 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX11-NEXT: s_cselect_b32 s7, 1, 0 -; GFX11-NEXT: s_and_b32 s0, 1, s1 +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_and_b32 s1, 1, s4 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX11-NEXT: s_and_b32 s0, 1, s7 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3] +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX11-NEXT: s_and_b32 s0, 1, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX11-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> 
@@ -6816,51 +6866,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshr_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: s_sub_i32 s5, s4, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s4 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX6-NEXT: s_sub_i32 s6, s5, 64 +; GFX6-NEXT: s_sub_i32 s7, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5 ; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s4 -; GFX6-NEXT: s_and_b32 s4, 1, s8 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX6-NEXT: s_and_b32 s4, 1, s9 -; GFX6-NEXT: s_sub_i32 s10, s6, 64 -; GFX6-NEXT: s_sub_i32 s8, 64, s6 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s5 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v1, v7 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s5 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 -; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s6 +; GFX6-NEXT: s_and_b32 s5, 1, s8 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_and_b32 s5, 1, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s10, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 +; GFX6-NEXT: s_cselect_b32 s11, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v4 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6869,51 +6919,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX8-LABEL: v_fshr_i128_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: s_sub_i32 s5, s4, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s4 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, 31, v1 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX8-NEXT: s_sub_i32 s6, s5, 64 +; GFX8-NEXT: s_sub_i32 s7, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5] -; GFX8-NEXT: s_and_b32 s4, 1, s8 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: s_and_b32 s4, 1, s9 -; GFX8-NEXT: s_sub_i32 s10, s6, 64 -; GFX8-NEXT: s_sub_i32 s8, 64, s6 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], s5, v[4:5] ; GFX8-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v1, v7 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 -; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s6, v[4:5] +; GFX8-NEXT: s_and_b32 s5, 1, s8 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_and_b32 s5, 1, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s10, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 +; GFX8-NEXT: s_cselect_b32 s11, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v4 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6922,51 +6972,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX9-LABEL: v_fshr_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: s_sub_i32 s5, s4, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s4 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: s_sub_i32 s6, s5, 64 +; GFX9-NEXT: s_sub_i32 s7, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; 
GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5] -; GFX9-NEXT: s_and_b32 s4, 1, s8 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: s_and_b32 s4, 1, s9 -; GFX9-NEXT: s_sub_i32 s10, s6, 64 -; GFX9-NEXT: s_sub_i32 s8, 64, s6 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], s5, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v1, v7 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 -; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s6, v[4:5] +; GFX9-NEXT: s_and_b32 s5, 1, s8 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_and_b32 s5, 1, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s10, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 +; GFX9-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v4 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6978,49 +7028,49 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s7, 64, s4 +; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4 +; GFX10-NEXT: s_sub_i32 s6, s5, 64 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX10-NEXT: s_sub_i32 s5, s4, 64 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_and_b32 s4, 1, s8 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX10-NEXT: s_and_b32 s5, 1, s8 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; 
GFX10-NEXT: s_and_b32 s4, 1, s9 -; GFX10-NEXT: s_sub_i32 s10, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s6, 1, s9 +; GFX10-NEXT: s_sub_i32 s10, s5, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7031,47 +7081,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s7, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_sub_i32 s6, s5, 64 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX11-NEXT: s_sub_i32 s5, s4, 64 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX11-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-NEXT: s_and_b32 s4, 1, s8 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX11-NEXT: s_and_b32 s5, 1, s8 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: s_and_b32 s4, 1, s9 -; GFX11-NEXT: s_sub_i32 s10, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s6, 1, s9 +; GFX11-NEXT: s_sub_i32 s10, s5, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; 
GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 ; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7209,435 +7259,447 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s24, s1, 31 -; GFX6-NEXT: s_mov_b32 s25, 0 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX6-NEXT: s_sub_i32 s19, s16, 64 -; GFX6-NEXT: s_sub_i32 s17, 64, s16 -; GFX6-NEXT: s_cmp_lt_u32 s16, 64 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_lshr_b32 s22, s1, 31 +; GFX6-NEXT: s_mov_b32 s23, 0 +; GFX6-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX6-NEXT: s_not_b32 s17, s16 +; GFX6-NEXT: s_sub_i32 s21, s2, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s2 +; GFX6-NEXT: s_cmp_lt_u32 s2, 64 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX6-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX6-NEXT: s_cmp_lg_u32 s24, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s29, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX6-NEXT: s_sub_i32 s24, s18, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s18 -; GFX6-NEXT: s_cmp_lt_u32 s18, 64 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX6-NEXT: s_cmp_lg_u32 s29, 0 +; GFX6-NEXT: s_cselect_b64 s[18:19], s[0:1], 
s[18:19] +; GFX6-NEXT: s_and_b32 s0, s16, 0x7f +; GFX6-NEXT: s_sub_i32 s21, s0, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s26, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX6-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX6-NEXT: s_lshr_b32 s24, s5, 31 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX6-NEXT: s_sub_i32 s9, s10, 64 -; GFX6-NEXT: s_sub_i32 s11, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_lshr_b32 s22, s5, 31 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX6-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX6-NEXT: s_not_b32 s16, s20 +; GFX6-NEXT: s_sub_i32 s18, s6, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s6 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_cselect_b32 s19, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s21, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX6-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s21, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX6-NEXT: s_sub_i32 s18, s8, 64 -; GFX6-NEXT: s_sub_i32 s16, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX6-NEXT: s_and_b32 s4, s20, 0x7f +; GFX6-NEXT: s_sub_i32 s18, s4, 64 +; GFX6-NEXT: s_sub_i32 s16, 64, s4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s21, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], 
s[12:13], s20 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s24, s1, 31 -; GFX8-NEXT: s_mov_b32 s25, 0 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX8-NEXT: s_sub_i32 s19, s16, 64 -; GFX8-NEXT: s_sub_i32 s17, 64, s16 -; GFX8-NEXT: s_cmp_lt_u32 s16, 64 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_lshr_b32 s22, s1, 31 +; GFX8-NEXT: s_mov_b32 s23, 0 +; GFX8-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX8-NEXT: s_not_b32 s17, s16 +; GFX8-NEXT: s_sub_i32 s21, s2, 64 +; GFX8-NEXT: s_sub_i32 s22, 64, s2 +; GFX8-NEXT: s_cmp_lt_u32 s2, 64 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX8-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX8-NEXT: s_cmp_lg_u32 s24, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX8-NEXT: s_sub_i32 s24, s18, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s18 -; GFX8-NEXT: s_cmp_lt_u32 s18, 64 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] +; GFX8-NEXT: s_and_b32 s0, s16, 0x7f +; GFX8-NEXT: s_sub_i32 s21, s0, 64 +; GFX8-NEXT: s_sub_i32 s22, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s18, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX8-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX8-NEXT: s_lshl_b64 s[24:25], 
s[10:11], s22 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX8-NEXT: s_lshr_b32 s24, s5, 31 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX8-NEXT: s_sub_i32 s9, s10, 64 -; GFX8-NEXT: s_sub_i32 s11, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_lshr_b32 s22, s5, 31 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX8-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX8-NEXT: s_not_b32 s16, s20 +; GFX8-NEXT: s_sub_i32 s18, s6, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX8-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX8-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX8-NEXT: s_sub_i32 s18, s8, 64 -; GFX8-NEXT: s_sub_i32 s16, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX8-NEXT: s_and_b32 s4, s20, 0x7f +; GFX8-NEXT: s_sub_i32 s18, s4, 64 +; GFX8-NEXT: s_sub_i32 s16, 64, s4 +; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cselect_b32 s21, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX8-NEXT: 
s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshr_b32 s24, s1, 31 -; GFX9-NEXT: s_mov_b32 s25, 0 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX9-NEXT: s_sub_i32 s19, s16, 64 -; GFX9-NEXT: s_sub_i32 s17, 64, s16 -; GFX9-NEXT: s_cmp_lt_u32 s16, 64 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_lshr_b32 s22, s1, 31 +; GFX9-NEXT: s_mov_b32 s23, 0 +; GFX9-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX9-NEXT: s_not_b32 s17, s16 +; GFX9-NEXT: s_sub_i32 s21, s2, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s2 +; GFX9-NEXT: s_cmp_lt_u32 s2, 64 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX9-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX9-NEXT: s_cmp_lg_u32 s24, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s29, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX9-NEXT: s_sub_i32 s24, s18, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s18 -; GFX9-NEXT: s_cmp_lt_u32 s18, 64 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX9-NEXT: s_cmp_lg_u32 s29, 0 +; GFX9-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] +; GFX9-NEXT: s_and_b32 s0, s16, 0x7f +; GFX9-NEXT: s_sub_i32 s21, s0, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s18, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX9-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX9-NEXT: 
s_lshr_b32 s24, s5, 31 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX9-NEXT: s_sub_i32 s9, s10, 64 -; GFX9-NEXT: s_sub_i32 s11, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_lshr_b32 s22, s5, 31 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX9-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX9-NEXT: s_not_b32 s16, s20 +; GFX9-NEXT: s_sub_i32 s18, s6, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_cselect_b32 s19, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX9-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX9-NEXT: s_sub_i32 s18, s8, 64 -; GFX9-NEXT: s_sub_i32 s16, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX9-NEXT: s_and_b32 s4, s20, 0x7f +; GFX9-NEXT: s_sub_i32 s18, s4, 64 +; GFX9-NEXT: s_sub_i32 s16, 64, s4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s21, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s22, s1, 31 -; GFX10-NEXT: s_mov_b32 s23, 0 +; GFX10-NEXT: s_lshr_b32 s18, s1, 31 +; GFX10-NEXT: s_mov_b32 s19, 0 +; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], 
s[2:3], s[22:23] -; GFX10-NEXT: s_sub_i32 s19, s16, 64 -; GFX10-NEXT: s_sub_i32 s17, 64, s16 -; GFX10-NEXT: s_cmp_lt_u32 s16, 64 -; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s16, 0 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19] +; GFX10-NEXT: s_not_b32 s18, s16 +; GFX10-NEXT: s_sub_i32 s21, s17, 64 +; GFX10-NEXT: s_sub_i32 s22, 64, s17 +; GFX10-NEXT: s_cmp_lt_u32 s17, 64 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 -; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX10-NEXT: s_cmp_lg_u32 s22, 0 -; GFX10-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s17, 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18 +; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s22, s18, 64 -; GFX10-NEXT: s_sub_i32 s19, 64, s18 -; GFX10-NEXT: s_cmp_lt_u32 s18, 64 +; GFX10-NEXT: s_and_b32 s0, s16, 0x7f +; GFX10-NEXT: s_sub_i32 s18, s0, 64 +; GFX10-NEXT: s_sub_i32 s17, 64, s0 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s18, 0 -; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s18 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s19 -; GFX10-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s22 -; GFX10-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16 +; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[18:19], 0 -; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_lshr_b32 s22, s5, 31 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX10-NEXT: s_lshr_b32 s18, s5, 31 +; GFX10-NEXT: s_andn2_b32 s8, 0x7f, s20 +; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23] -; GFX10-NEXT: s_sub_i32 s9, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 -; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 -; 
GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s21, 0 -; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX10-NEXT: s_not_b32 s16, s20 ; GFX10-NEXT: s_sub_i32 s18, s8, 64 ; GFX10-NEXT: s_sub_i32 s9, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s9 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s16 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s18 +; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: s_and_b32 s4, s20, 0x7f +; GFX10-NEXT: s_sub_i32 s18, s4, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s4 +; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s20 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[14:15], s8 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[14:15], s20 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s18 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s22, s1, 31 -; GFX11-NEXT: s_mov_b32 s23, 0 +; GFX11-NEXT: s_lshr_b32 s18, s1, 31 +; GFX11-NEXT: s_mov_b32 s19, 0 +; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[22:23] -; GFX11-NEXT: s_sub_i32 s19, s16, 64 -; GFX11-NEXT: s_sub_i32 s17, 64, s16 -; GFX11-NEXT: s_cmp_lt_u32 s16, 64 -; GFX11-NEXT: s_cselect_b32 s22, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s16, 0 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19] +; GFX11-NEXT: s_not_b32 s18, s16 +; GFX11-NEXT: s_sub_i32 s21, s17, 64 +; GFX11-NEXT: s_sub_i32 s22, 64, s17 +; GFX11-NEXT: s_cmp_lt_u32 s17, 64 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 -; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX11-NEXT: 
s_lshl_b64 s[0:1], s[0:1], s19 -; GFX11-NEXT: s_cmp_lg_u32 s22, 0 -; GFX11-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s17, 0 +; GFX11-NEXT: s_cselect_b32 s17, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18 +; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s22, s18, 64 -; GFX11-NEXT: s_sub_i32 s19, 64, s18 -; GFX11-NEXT: s_cmp_lt_u32 s18, 64 +; GFX11-NEXT: s_and_b32 s0, s16, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s18, s0, 64 +; GFX11-NEXT: s_sub_i32 s17, 64, s0 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s26, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s18, 0 -; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s18 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[10:11], s19 -; GFX11-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s22 -; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16 +; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX11-NEXT: s_cmp_lg_u32 s27, 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[18:19], 0 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21] +; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_lshr_b32 s22, s5, 31 ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX11-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX11-NEXT: s_lshr_b32 s18, s5, 31 +; GFX11-NEXT: s_and_not1_b32 s8, 0x7f, s20 +; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23] -; GFX11-NEXT: s_sub_i32 s9, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 -; GFX11-NEXT: s_cselect_b32 s21, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s21, 0 -; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX11-NEXT: s_not_b32 s16, s20 ; GFX11-NEXT: s_sub_i32 s18, s8, 64 ; GFX11-NEXT: s_sub_i32 s9, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 ; GFX11-NEXT: s_cselect_b32 s19, 1, 0 ; 
GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] -; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s9 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s16 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s18 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX11-NEXT: s_and_b32 s4, s20, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s18, s4, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s4 +; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s20 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[14:15], s8 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[14:15], s20 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s18 ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: ; return to shader part epilog @@ -7649,68 +7711,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshr_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v16 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], 1 +; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v24 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[16:17], v0 -; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v24 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v24 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[16:17], v24 -; GFX6-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX6-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[16:17], v25 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v23 +; GFX6-NEXT: v_not_b32_e32 v0, v16 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 +; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_lshl_b64 
v[23:24], v[17:18], v19 +; GFX6-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX6-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v25 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v22 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v22 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 -; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v22 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v23 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_not_b32_e32 v8, v20 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v22 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v19 +; GFX6-NEXT: v_not_b32_e32 v4, v20 +; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v19 -; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v19 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v19 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 +; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v18 ; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v20 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v19 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v18 @@ -7729,8 +7791,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; 
GFX6-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX6-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v10, v6 @@ -7740,68 +7802,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v16, v16 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v24 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17] -; GFX8-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v24 -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17] -; GFX8-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX8-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9] +; GFX8-NEXT: v_not_b32_e32 v0, v16 +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] +; GFX8-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX8-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v22 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v22 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_not_b32_e32 v8, v20 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; 
GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v19 +; GFX8-NEXT: v_not_b32_e32 v4, v20 +; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v19 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18 @@ -7820,8 +7882,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v10, v6 @@ -7831,68 +7893,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-LABEL: v_fshr_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_not_b32_e32 v16, v16 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17] -; GFX9-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v24 -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17] -; GFX9-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX9-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v23 
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9] +; GFX9-NEXT: v_not_b32_e32 v0, v16 +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] +; GFX9-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX9-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v22 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX9-NEXT: v_not_b32_e32 v8, v20 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, 64, v19 +; GFX9-NEXT: v_not_b32_e32 v4, v20 +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v19 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; 
GFX9-NEXT: v_sub_u32_e32 v6, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc @@ -7911,8 +7973,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX9-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v10, v6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 404e726246f4d..81abe91b283f9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -2787,52 +2787,51 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-LABEL: v_sdiv_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5 +; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0 +; CGP-NEXT: v_rcp_f32_e32 v7, v3 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 -; CGP-NEXT: v_rcp_f32_e32 v8, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0 +; CGP-NEXT: v_mov_b32_e32 v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 +; CGP-NEXT: v_mul_lo_u32 v4, v1, v5 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v1 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v4, v1, v8, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mov_b32_e32 v5, v1 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v1, v5, v3 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v5 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v5 ; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v6, v0 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v7, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v4 ; CGP-NEXT: v_mov_b32_e32 v7, v1 -; CGP-NEXT: 
v_mul_lo_u32 v8, v7, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v7, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v7 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 3729f1cc2b12d..183f2edbf9035 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -563,18 +563,21 @@ define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX8-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_i32_zext_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i32_zext_i16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i16 %x, 16383 %ext = zext i16 %and to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 5b94e71ecf52e..cfac0c2fa56aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3286,45 +3286,45 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-LABEL: v_srem_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5 +; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0 +; CGP-NEXT: v_rcp_f32_e32 v7, v3 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 -; CGP-NEXT: v_rcp_f32_e32 v8, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CGP-NEXT: 
v_cvt_u32_f32_e32 v1, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0 +; CGP-NEXT: v_mov_b32_e32 v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v4, v1, v5 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v7, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v0, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v3, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v1, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index e31d8e95bd608..1ee521b3dedac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2147,26 +2147,26 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 -; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v3 -; CGP-NEXT: v_rcp_f32_e32 v4, v2 +; CGP-NEXT: v_rcp_f32_e32 v4, v1 ; CGP-NEXT: v_rcp_f32_e32 v5, v3 ; CGP-NEXT: v_mul_f32_e32 v4, v0, v4 -; CGP-NEXT: v_mul_f32_e32 v5, v1, v5 +; CGP-NEXT: v_mul_f32_e32 v5, v2, v5 ; CGP-NEXT: v_trunc_f32_e32 v4, v4 ; CGP-NEXT: v_trunc_f32_e32 v5, v5 -; CGP-NEXT: v_mad_f32 v0, -v4, v2, v0 +; CGP-NEXT: v_mad_f32 v0, -v4, v1, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mad_f32 v1, -v5, v3, v1 +; CGP-NEXT: v_mad_f32 v2, -v5, v3, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, 
v5 -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v2 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, v3 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, v3 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index f30b278b3e611..a7e5ce3d21619 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2561,12 +2561,12 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 -; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 ; CGP-NEXT: v_rcp_f32_e32 v8, v5 ; CGP-NEXT: v_rcp_f32_e32 v9, v7 @@ -2584,10 +2584,10 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mul_lo_u32 v2, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v1, v4, v1 ; CGP-NEXT: v_mul_lo_u32 v3, v5, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v2, v3 ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v1 ; CGP-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll index 9ea9fa91e4f92..1b35a89ad7f93 100644 --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -278,7 +278,6 @@ define amdgpu_ps i64 @s_csh_64_0(i64 inreg %a, i64 inreg %b) { ; ; GISEL-LABEL: s_csh_64_0: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 63 ; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2 ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 @@ -310,7 +309,6 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) { ; ; GISEL-LABEL: s_csh_64_1: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 0xff ; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2 ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index a0b549711f339..93e14a205f05d 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 +; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 24 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] @@ -1837,7 +1837,7 @@ 
define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 25
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0

From b88aced1abd8280e305d176c3cc5d85ae720ae50 Mon Sep 17 00:00:00 2001
From: jeanPerier
Date: Wed, 11 Sep 2024 15:30:23 +0200
Subject: [PATCH 101/114] [flang][lowering] handle procedure pointers with generic name (#108043)

Handle procedure pointers with the same name as generics in lowering to
avoid crashes after #107928.
---
 flang/lib/Lower/PFTBuilder.cpp | 9 +++-
 .../HLFIR/procedure-pointer-in-generics.f90 | 46 +++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90

diff --git a/flang/lib/Lower/PFTBuilder.cpp b/flang/lib/Lower/PFTBuilder.cpp
index 5b3d5471925bf..793e291a168ad 100644
--- a/flang/lib/Lower/PFTBuilder.cpp
+++ b/flang/lib/Lower/PFTBuilder.cpp
@@ -1566,6 +1566,14 @@ struct SymbolDependenceAnalysis {
       return 0;
     LLVM_DEBUG(llvm::dbgs() << "analyze symbol " << &sym << " in <"
                             << &sym.owner() << ">: " << sym << '\n');
+    const semantics::Symbol &ultimate = sym.GetUltimate();
+    if (const auto *details = ultimate.detailsIf<semantics::GenericDetails>()) {
+      // Procedure pointers may be "hidden" behind the generic symbol if they
+      // have the same name.
+      if (const semantics::Symbol *specific = details->specific())
+        analyze(*specific);
+      return 0;
+    }
     const bool isProcedurePointerOrDummy =
         semantics::IsProcedurePointer(sym) ||
         (semantics::IsProcedure(sym) && IsDummy(sym));
@@ -1582,7 +1590,6 @@ struct SymbolDependenceAnalysis {
     if (sym.owner().IsDerivedType())
       return 0;
 
-    semantics::Symbol ultimate = sym.GetUltimate();
     if (const auto *details = ultimate.detailsIf<semantics::NamelistDetails>()) {
       // handle namelist group symbols
diff --git a/flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90 b/flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90
new file mode 100644
index 0000000000000..ff447d31b1af1
--- /dev/null
+++ b/flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90
@@ -0,0 +1,46 @@
+! Test procedure pointers with the same name as generics.
+! 
RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+module m_gen
+  procedure(func), pointer :: foo
+  interface foo
+    procedure :: foo
+  end interface
+  interface
+    real function func(x)
+      real :: x
+    end function
+  end interface
+end
+!CHECK-LABEL: fir.global @_QMm_genEfoo : !fir.boxproc<(!fir.ref<f32>) -> f32> {
+!CHECK: %[[VAL_0:.*]] = fir.zero_bits (!fir.ref<f32>) -> f32
+!CHECK: %[[VAL_1:.*]] = fir.emboxproc %[[VAL_0]] : ((!fir.ref<f32>) -> f32) -> !fir.boxproc<(!fir.ref<f32>) -> f32>
+!CHECK: fir.has_value %[[VAL_1]] : !fir.boxproc<(!fir.ref<f32>) -> f32>
+
+subroutine test1()
+  use m_gen
+  foo => func
+end subroutine
+!CHECK-LABEL: func.func @_QPtest1() {
+!CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMm_genEfoo) : !fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>
+!CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}"_QMm_genEfoo"{{.*}} : (!fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>) -> (!fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>, !fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>)
+!CHECK: %[[VAL_2:.*]] = fir.address_of(@_QPfunc) : (!fir.ref<f32>) -> f32
+!CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : ((!fir.ref<f32>) -> f32) -> !fir.boxproc<() -> ()>
+!CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> !fir.boxproc<(!fir.ref<f32>) -> f32>
+!CHECK: fir.store %[[VAL_4]] to %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>
+
+subroutine test_local()
+  use m_gen, only : func
+  procedure(func), pointer :: foo
+  interface foo
+    procedure :: foo
+  end interface
+  foo => func
+end subroutine
+!CHECK-LABEL: func.func @_QPtest_local() {
+!CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxproc<(!fir.ref<f32>) -> f32> {bindc_name = "foo", uniq_name = "_QFtest_localEfoo"}
+!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}"_QFtest_localEfoo"{{.*}} : (!fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>) -> (!fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>, !fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>)
+!CHECK: %[[VAL_4:.*]] = fir.address_of(@_QPfunc) : (!fir.ref<f32>) -> f32
+!CHECK: %[[VAL_5:.*]] = fir.emboxproc %[[VAL_4]] : ((!fir.ref<f32>) -> f32) -> !fir.boxproc<() -> ()>
+!CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.boxproc<() -> ()>) -> !fir.boxproc<(!fir.ref<f32>) -> f32>
+!CHECK: fir.store %[[VAL_6]] to %[[VAL_3]]#0 : !fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>

From 7be6ea124430c3461fb85588d6eb1af70930cfe7 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Wed, 11 Sep 2024 06:39:30 -0700
Subject: [PATCH 102/114] [Dialect] Avoid repeated hash lookups (NFC) (#108137)

---
 mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index d51d63f243ea0..85604eef2f283 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -465,9 +465,8 @@ bool AnalysisState::isValueRead(Value value) const {
   while (!workingSet.empty()) {
     OpOperand *uMaybeReading = workingSet.pop_back_val();
-    if (visited.contains(uMaybeReading))
+    if (!visited.insert(uMaybeReading).second)
       continue;
-    visited.insert(uMaybeReading);
 
     // Skip over all ops that neither read nor write (but create an alias).
if (bufferizesToAliasOnly(*uMaybeReading)) From 7dfaedf86127620821da7d31bf08fe1403f19ffe Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Sep 2024 06:39:58 -0700 Subject: [PATCH 103/114] [TableGen] Avoid repeated hash lookups (NFC) (#108138) --- llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index 1b93e3d5e3b70..a14cc3d6b844c 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -71,9 +71,9 @@ class MatcherTableEmitter { MapVector> VecPatterns; unsigned getPatternIdxFromTable(std::string &&P, std::string &&include_loc) { - const auto It = VecPatterns.find(P); - if (It == VecPatterns.end()) { - VecPatterns.insert(std::pair(std::move(P), VecPatterns.size())); + const auto [It, Inserted] = + VecPatterns.try_emplace(std::move(P), VecPatterns.size()); + if (Inserted) { VecIncludeStrings.push_back(std::move(include_loc)); return VecIncludeStrings.size() - 1; } From 4b1b450ae4b8fb9185f337c15315f4f473c3b429 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Sep 2024 06:40:17 -0700 Subject: [PATCH 104/114] [Transforms] Avoid repeated hash lookups (NFC) (#108139) --- mlir/lib/Transforms/Utils/DialectConversion.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 450e66f0db4e7..f288c7fc2cb77 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -2486,9 +2486,8 @@ static void eraseDeadUnrealizedCasts( // Do not visit ops multiple times. If we find a circle, no live user was // found on the current path. - if (visited.contains(op)) + if (!visited.insert(op).second) return false; - visited.insert(op); // Visit all users. for (Operation *user : op->getUsers()) { From 6ffa7cd8b04ab4b771925d329d7ee8788a3b00ca Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Sep 2024 06:40:37 -0700 Subject: [PATCH 105/114] [Interfaces] Avoid repeated hash lookups (NFC) (#108140) --- mlir/lib/Interfaces/ValueBoundsOpInterface.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp index 6420c192b257d..505e84e3ca0cf 100644 --- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp +++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp @@ -605,9 +605,8 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound( worklist.push_back(v); while (!worklist.empty()) { Value next = worklist.pop_back_val(); - if (visited.contains(next)) + if (!visited.insert(next).second) continue; - visited.insert(next); if (llvm::is_contained(independencies, next)) return false; // TODO: DominanceInfo could be used to stop the traversal early. From 01967e265889a9e242122087c41a97139e84bdc0 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 11 Sep 2024 14:55:14 +0100 Subject: [PATCH 106/114] [AMDGPU] Shrink a live interval instead of recomputing it. NFCI. 
(#108171) --- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index f9d7ead4ff3ec..38ebda6cde1e5 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1527,13 +1527,18 @@ bool SIWholeQuadMode::lowerCopyInstrs() { for (MachineInstr *MI : LowerToCopyInstrs) { LLVM_DEBUG(dbgs() << "simplify: " << *MI); - Register RecomputeReg = 0; if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { assert(MI->getNumExplicitOperands() == 3); + + LiveInterval *RecomputeLI = nullptr; if (MI->getOperand(2).isReg()) - RecomputeReg = MI->getOperand(2).getReg(); + RecomputeLI = &LIS->getInterval(MI->getOperand(2).getReg()); + MI->removeOperand(2); + + if (RecomputeLI) + LIS->shrinkToUses(RecomputeLI); } else { assert(MI->getNumExplicitOperands() == 2); } @@ -1550,11 +1555,6 @@ bool SIWholeQuadMode::lowerCopyInstrs() { MI->setDesc(TII->get(CopyOp)); LLVM_DEBUG(dbgs() << " -> " << *MI); - - if (RecomputeReg) { - LIS->removeInterval(RecomputeReg); - LIS->createAndComputeVirtRegInterval(RecomputeReg); - } } return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty(); } From 7a30b9c0f0c9523e29b449d80c695e6d8df0f176 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 11 Sep 2024 14:55:53 +0100 Subject: [PATCH 107/114] [AMDGPU] Make more use of getWaveMaskRegClass. NFC. (#108186) --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 4 ++-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +++--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 14 ++++++-------- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 2 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 6 ++---- 5 files changed, 14 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 44872761760db..434336ef137ff 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -1116,8 +1116,8 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { Register SrcReg = MI.getOperand(1).getReg(); Register DstReg = MI.getOperand(0).getReg(); if (SrcReg == AMDGPU::SCC) { - Register SCCCopy = MRI->createVirtualRegister( - TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); + Register SCCCopy = + MRI->createVirtualRegister(TRI->getWaveMaskRegClass()); I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)), MI.getDebugLoc(), TII->get(IsWave32 ? 
AMDGPU::S_CSELECT_B32 diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 736f714ac1a77..bbb1d0c5eba14 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4562,7 +4562,7 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); Register DstReg = MI.getOperand(0).getReg(); Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); @@ -5064,7 +5064,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return BB; } - const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *CarryRC = TRI->getWaveMaskRegClass(); Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -5296,7 +5296,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *CondRC = TRI->getWaveMaskRegClass(); Register SrcCondCopy = MRI.createVirtualRegister(CondRC); const TargetRegisterClass *Src0RC = Src0.isReg() diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c6f28af1e5e73..87b213767b4fc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1231,8 +1231,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, Register TrueReg, Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *BoolXExecRC = - RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass(); assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && "Not a VGPR32 reg"); @@ -6417,7 +6416,7 @@ static void emitLoadScalarOpsFromVGPRLoop( ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; unsigned AndOpc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); MachineBasicBlock::iterator I = LoopBB.begin(); @@ -6565,7 +6564,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); // Save SCC. Waterfall Loop may overwrite SCC. 
Register SaveSCCReg;
@@ -6958,7 +6957,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
   Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
 
-    const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+    const auto *BoolXExecRC = RI.getWaveMaskRegClass();
     Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
     Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
 
@@ -7336,7 +7335,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                        ? AMDGPU::V_ADDC_U32_e64
                        : AMDGPU::V_SUBB_U32_e64;
-    const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+    const auto *CarryRC = RI.getWaveMaskRegClass();
 
     Register CarryInReg = Inst.getOperand(4).getReg();
     if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
@@ -7711,8 +7710,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
 
   Register NewCondReg = CondReg;
   if (IsSCC) {
-    const TargetRegisterClass *TC =
-        RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+    const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
     NewCondReg = MRI.createVirtualRegister(TC);
 
     // Now look for the closest SCC def if it is a copy
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 1b52a48d068eb..23d04fae42015 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2014,7 +2014,7 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
   MachineOperand OffsetHi =
       createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
 
-  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+  const auto *CarryRC = TRI->getWaveMaskRegClass();
   Register CarryReg = MRI->createVirtualRegister(CarryRC);
   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4c571a36e4896..2d1cd1bda3afe 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3428,8 +3428,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
         std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
   case AMDGPU::VCCRegBankID:
     assert(Size == 1);
-    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
-                    : &AMDGPU::SReg_64_XEXECRegClass;
+    return getWaveMaskRegClass();
   case AMDGPU::SGPRRegBankID:
     return getSGPRClassForBitWidth(std::max(32u, Size));
   case AMDGPU::AGPRRegBankID:
@@ -3472,8 +3471,7 @@ SIRegisterInfo::getRegClass(unsigned RCID) const {
   case AMDGPU::SReg_1RegClassID:
     return getBoolRC();
   case AMDGPU::SReg_1_XEXECRegClassID:
-    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
-                    : &AMDGPU::SReg_64_XEXECRegClass;
+    return getWaveMaskRegClass();
   case -1:
     return nullptr;
   default:

From ccc4fa18423f097a9f04d3198cacf094445ffd71 Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Wed, 11 Sep 2024 06:57:31 -0700
Subject: [PATCH 108/114] [TableGen] Fix MacOS failure in Option Emitter.
 (#108225)

Handle the case where the same pointer is used for both inputs to
`CompareOptionRecords`, to avoid emitting errors for equivalent options.

Follow-up to #107696.
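Background for the fix: std::sort and llvm::sort require a strict weak
ordering, which must be irreflexive, i.e. Cmp(X, X) == false. Some standard
library implementations (notably libc++, the default on macOS) can invoke the
comparator with the same element on both sides, which previously drove
CompareOptionRecords into its "equivalent options" error path. A minimal
standalone sketch of the invariant, using made-up Opt/CompareOpts names rather
than the actual TableGen record types:

  #include <algorithm>
  #include <cassert>
  #include <vector>

  struct Opt {
    int Precedence;
  };

  // A strict-weak-ordering comparator must return false when both
  // arguments denote the same element, so bail out before any
  // "equivalent options" diagnostic could fire.
  static bool CompareOpts(const Opt *A, const Opt *B) {
    if (A == B)
      return false;
    return A->Precedence < B->Precedence;
  }

  int main() {
    Opt X{1}, Y{0};
    std::vector<Opt *> Opts = {&X, &Y};
    assert(!CompareOpts(&X, &X) && "comparator must be irreflexive");
    std::sort(Opts.begin(), Opts.end(), CompareOpts);
    return Opts.front() == &Y ? 0 : 1;
  }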
---
 llvm/utils/TableGen/Common/OptEmitter.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/TableGen/Common/OptEmitter.cpp b/llvm/utils/TableGen/Common/OptEmitter.cpp
index 1c91ec5b3dbc4..75e32c36d4f72 100644
--- a/llvm/utils/TableGen/Common/OptEmitter.cpp
+++ b/llvm/utils/TableGen/Common/OptEmitter.cpp
@@ -41,6 +41,8 @@ static int StrCmpOptionName(const char *A, const char *B) {
 
 // Returns true if A is ordered before B.
 bool CompareOptionRecords(const Record *A, const Record *B) {
+  if (A == B)
+    return false;
   // Sentinel options precede all others and are only ordered by precedence.
   bool ASent = A->getValueAsDef("Kind")->getValueAsBit("Sentinel");
   bool BSent = B->getValueAsDef("Kind")->getValueAsBit("Sentinel");

From 35f7cfb22420a7c94b48e54fa28195ada9863d1a Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Wed, 11 Sep 2024 16:14:41 +0200
Subject: [PATCH 109/114] [clang][bytecode] Check for Pointer dereference in
 EvaluationResult (#108207)

We will deref<>() it later, so this is the right check.
---
 clang/lib/AST/ByteCode/EvaluationResult.cpp  |  4 ++--
 clang/test/AST/ByteCode/initializer_list.cpp | 20 ++++++++++++++++++--
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/clang/lib/AST/ByteCode/EvaluationResult.cpp b/clang/lib/AST/ByteCode/EvaluationResult.cpp
index bdebd19af9f94..627d4b2f65be9 100644
--- a/clang/lib/AST/ByteCode/EvaluationResult.cpp
+++ b/clang/lib/AST/ByteCode/EvaluationResult.cpp
@@ -178,8 +178,8 @@ bool EvaluationResult::checkFullyInitialized(InterpState &S,
 static void collectBlocks(const Pointer &Ptr,
                           llvm::SetVector &Blocks) {
   auto isUsefulPtr = [](const Pointer &P) -> bool {
-    return P.isLive() && !P.isZero() && !P.isDummy() &&
-           !P.isUnknownSizeArray() && !P.isOnePastEnd() && P.isBlockPointer();
+    return P.isLive() && !P.isZero() && !P.isDummy() && P.isDereferencable() &&
+           !P.isUnknownSizeArray() && !P.isOnePastEnd();
   };
 
   if (!isUsefulPtr(Ptr))
diff --git a/clang/test/AST/ByteCode/initializer_list.cpp b/clang/test/AST/ByteCode/initializer_list.cpp
index 4e3b8dc912016..f882e4ff1b124 100644
--- a/clang/test/AST/ByteCode/initializer_list.cpp
+++ b/clang/test/AST/ByteCode/initializer_list.cpp
@@ -1,8 +1,6 @@
 // RUN: %clang_cc1 -fexperimental-new-constant-interpreter -fms-extensions -std=c++20 -verify=expected,both %s
 // RUN: %clang_cc1 -std=c++20 -fms-extensions -verify=ref,both %s
 
-// both-no-diagnostics
-
 namespace std {
   typedef decltype(sizeof(int)) size_t;
   template
@@ -53,3 +51,21 @@ constexpr int foo() {
 }
 
 static_assert(foo() == 0);
+
+
+namespace rdar13395022 {
+  struct MoveOnly { // both-note {{candidate}}
+    MoveOnly(MoveOnly&&); // both-note 2{{copy constructor is implicitly deleted because}} both-note {{candidate}}
+  };
+
+  void test(MoveOnly mo) {
+    auto &&list1 = {mo}; // both-error {{call to implicitly-deleted copy constructor}} both-note {{in initialization of temporary of type 'std::initializer_list}}
+    MoveOnly (&&list2)[1] = {mo}; // both-error {{call to implicitly-deleted copy constructor}} both-note {{in initialization of temporary of type 'MoveOnly[1]'}}
+    std::initializer_list<MoveOnly> &&list3 = {};
+    MoveOnly (&&list4)[1] = {}; // both-error {{no matching constructor}}
+    // both-note@-1 {{in implicit initialization of array element 0 with omitted initializer}}
+    // both-note@-2 {{in initialization of temporary of type 'MoveOnly[1]' created to list-initialize this reference}}
+  }
+}
+
+

From 43da8a7a10237e8cb89e6d776bec81d97b5326d1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 11 Sep 2024 15:18:16 +0100
Subject: [PATCH
110/114] [DAG] Add test coverage for ABD "sub of selects" patterns based off #53045 Add tests for "sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abd(a,b)" patterns that still fail to match to abd nodes This will hopefully be helped by #108218 --- llvm/test/CodeGen/AArch64/abds.ll | 84 +++++++ llvm/test/CodeGen/AArch64/abdu.ll | 85 +++++++ llvm/test/CodeGen/RISCV/abds.ll | 404 ++++++++++++++++++++++++++++++ llvm/test/CodeGen/RISCV/abdu.ll | 393 ++++++++++++++++++++++++++++- llvm/test/CodeGen/X86/abds.ll | 205 +++++++++++++++ llvm/test/CodeGen/X86/abdu.ll | 206 +++++++++++++++ 6 files changed, 1376 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll index 0e35f8240848b..e5cc04f9be1a1 100644 --- a/llvm/test/CodeGen/AArch64/abds.ll +++ b/llvm/test/CodeGen/AArch64/abds.ll @@ -539,6 +539,90 @@ define i64 @vector_legalized(i16 %a, i16 %b) { ret i64 %z } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; CHECK-LABEL: abd_select_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: cmp w8, w1, sxtb +; CHECK-NEXT: csel w8, w0, w1, lt +; CHECK-NEXT: csel w9, w1, w0, lt +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp slt i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; CHECK-LABEL: abd_select_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: cmp w8, w1, sxth +; CHECK-NEXT: csel w8, w0, w1, le +; CHECK-NEXT: csel w9, w1, w0, le +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp sle i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; CHECK-LABEL: abd_select_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: csel w8, w0, w1, gt +; CHECK-NEXT: csel w9, w1, w0, gt +; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: abd_select_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x1 +; CHECK-NEXT: csel x8, x0, x1, ge +; CHECK-NEXT: csel x9, x1, x0, ge +; CHECK-NEXT: sub x0, x8, x9 +; CHECK-NEXT: ret + %cmp = icmp sge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; CHECK-LABEL: abd_select_i128: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: sbcs xzr, x1, x3 +; CHECK-NEXT: csel x8, x0, x2, lt +; CHECK-NEXT: csel x9, x2, x0, lt +; CHECK-NEXT: csel x10, x1, x3, lt +; CHECK-NEXT: csel x11, x3, x1, lt +; CHECK-NEXT: subs x0, x9, x8 +; CHECK-NEXT: sbc x1, x11, x10 +; CHECK-NEXT: ret + %cmp = icmp slt i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll index eb866e6a78a9b..0a44ae1688458 100644 --- a/llvm/test/CodeGen/AArch64/abdu.ll +++ b/llvm/test/CodeGen/AArch64/abdu.ll @@ -400,6 +400,91 @@ define i64 @vector_legalized(i16 %a, 
i16 %b) { ret i64 %z } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abdu(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; CHECK-LABEL: abd_select_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: cmp w8, w1, uxtb +; CHECK-NEXT: csel w8, w0, w1, lo +; CHECK-NEXT: csel w9, w1, w0, lo +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp ult i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; CHECK-LABEL: abd_select_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: cmp w8, w1, uxth +; CHECK-NEXT: csel w8, w0, w1, ls +; CHECK-NEXT: csel w9, w1, w0, ls +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp ule i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; CHECK-LABEL: abd_select_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: csel w8, w0, w1, hi +; CHECK-NEXT: csel w9, w1, w0, hi +; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: ret + %cmp = icmp ugt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: abd_select_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x1 +; CHECK-NEXT: csel x8, x0, x1, hs +; CHECK-NEXT: csel x9, x1, x0, hs +; CHECK-NEXT: sub x0, x8, x9 +; CHECK-NEXT: ret + %cmp = icmp uge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; CHECK-LABEL: abd_select_i128: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: sbcs xzr, x1, x3 +; CHECK-NEXT: csel x8, x0, x2, lo +; CHECK-NEXT: csel x9, x2, x0, lo +; CHECK-NEXT: csel x10, x1, x3, lo +; CHECK-NEXT: csel x11, x3, x1, lo +; CHECK-NEXT: subs x0, x9, x8 +; CHECK-NEXT: sbc x1, x11, x10 +; CHECK-NEXT: ret + %cmp = icmp ult i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} + declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index 86b36d8f69e95..919214b0e9a8d 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -2341,6 +2341,410 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind { ret i32 %abs } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; RV32I-LABEL: abd_select_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a1, 24 +; RV32I-NEXT: srai a2, a2, 24 +; RV32I-NEXT: slli a3, a0, 24 +; RV32I-NEXT: srai a3, a3, 24 +; RV32I-NEXT: blt a3, a2, .LBB34_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB34_2: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a2, a1, 56 +; RV64I-NEXT: srai a2, a2, 56 +; RV64I-NEXT: slli a3, a0, 56 +; RV64I-NEXT: srai a3, a3, 56 +; RV64I-NEXT: blt a3, a2, .LBB34_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB34_2: +; RV64I-NEXT: sub a0, a1, a0 +; 
RV64I-NEXT: ret +; +; ZBB-LABEL: abd_select_i8: +; ZBB: # %bb.0: +; ZBB-NEXT: sext.b a1, a1 +; ZBB-NEXT: sext.b a0, a0 +; ZBB-NEXT: min a2, a0, a1 +; ZBB-NEXT: max a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp slt i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; RV32I-LABEL: abd_select_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: srai a2, a2, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: srai a3, a3, 16 +; RV32I-NEXT: bge a3, a2, .LBB35_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB35_2: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a2, a0, 48 +; RV64I-NEXT: srai a2, a2, 48 +; RV64I-NEXT: slli a3, a1, 48 +; RV64I-NEXT: srai a3, a3, 48 +; RV64I-NEXT: bge a3, a2, .LBB35_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB35_2: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; +; ZBB-LABEL: abd_select_i16: +; ZBB: # %bb.0: +; ZBB-NEXT: sext.h a1, a1 +; ZBB-NEXT: sext.h a0, a0 +; ZBB-NEXT: min a2, a0, a1 +; ZBB-NEXT: max a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp sle i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: abd_select_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: blt a1, a0, .LBB36_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB36_2: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a2, a0 +; RV64I-NEXT: sext.w a3, a1 +; RV64I-NEXT: blt a3, a2, .LBB36_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB36_2: +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: min a2, a0, a1 +; RV32ZBB-NEXT: max a0, a0, a1 +; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a1 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: min a2, a0, a1 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp sgt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: abd_select_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB37_3 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a3 +; RV32I-NEXT: bnez a4, .LBB37_4 +; RV32I-NEXT: .LBB37_2: +; RV32I-NEXT: mv a4, a1 +; RV32I-NEXT: mv a5, a0 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: j .LBB37_5 +; RV32I-NEXT: .LBB37_3: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: beqz a4, .LBB37_2 +; RV32I-NEXT: .LBB37_4: +; RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: mv a5, a2 +; RV32I-NEXT: .LBB37_5: +; RV32I-NEXT: sltu a2, a5, a0 +; RV32I-NEXT: sub a1, a4, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, a5, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: bge a0, a1, .LBB37_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB37_2: +; 
RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: sltu a4, a2, a0 +; RV32ZBB-NEXT: mv a5, a4 +; RV32ZBB-NEXT: beq a1, a3, .LBB37_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a5, a3, a1 +; RV32ZBB-NEXT: .LBB37_2: +; RV32ZBB-NEXT: bnez a5, .LBB37_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: ret +; RV32ZBB-NEXT: .LBB37_4: +; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: min a2, a0, a1 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp sge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; RV32I-LABEL: abd_select_i128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a7, 4(a2) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw t0, 12(a2) +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: beq a5, t0, .LBB38_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt t1, a5, t0 +; RV32I-NEXT: j .LBB38_3 +; RV32I-NEXT: .LBB38_2: +; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: .LBB38_3: +; RV32I-NEXT: lw t3, 0(a2) +; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: beq a3, a7, .LBB38_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: sltu a2, a3, a7 +; RV32I-NEXT: j .LBB38_6 +; RV32I-NEXT: .LBB38_5: +; RV32I-NEXT: sltu a2, a1, t3 +; RV32I-NEXT: .LBB38_6: +; RV32I-NEXT: xor t2, a5, t0 +; RV32I-NEXT: xor t4, a4, a6 +; RV32I-NEXT: or t2, t4, t2 +; RV32I-NEXT: beqz t2, .LBB38_8 +; RV32I-NEXT: # %bb.7: +; RV32I-NEXT: mv a2, t1 +; RV32I-NEXT: .LBB38_8: +; RV32I-NEXT: bnez a2, .LBB38_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a2, t3 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t2, a6 +; RV32I-NEXT: j .LBB38_11 +; RV32I-NEXT: .LBB38_10: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv t1, a3 +; RV32I-NEXT: mv t4, a5 +; RV32I-NEXT: mv t2, a4 +; RV32I-NEXT: mv a1, t3 +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: mv a4, a6 +; RV32I-NEXT: .LBB38_11: +; RV32I-NEXT: sltu a6, a4, t2 +; RV32I-NEXT: sub a7, a5, t4 +; RV32I-NEXT: sltu a5, a1, a2 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: mv a7, a5 +; RV32I-NEXT: beq a3, t1, .LBB38_13 +; RV32I-NEXT: # %bb.12: +; RV32I-NEXT: sltu a7, a3, t1 +; RV32I-NEXT: .LBB38_13: +; RV32I-NEXT: sub a4, a4, t2 +; RV32I-NEXT: sltu t0, a4, a7 +; RV32I-NEXT: sub a6, a6, t0 +; RV32I-NEXT: sub a4, a4, a7 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a6, 12(a0) +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i128: +; RV64I: # %bb.0: +; RV64I-NEXT: beq a1, a3, .LBB38_3 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: slt a4, a1, a3 +; RV64I-NEXT: beqz a4, .LBB38_4 +; RV64I-NEXT: .LBB38_2: +; RV64I-NEXT: mv a4, a1 +; RV64I-NEXT: mv a5, a0 +; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: j .LBB38_5 +; RV64I-NEXT: .LBB38_3: +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: bnez a4, .LBB38_2 +; RV64I-NEXT: .LBB38_4: +; RV64I-NEXT: mv a4, a3 +; RV64I-NEXT: mv a5, a2 +; RV64I-NEXT: .LBB38_5: +; RV64I-NEXT: sltu a2, a0, a5 +; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: sub a1, a1, a2 
+; RV64I-NEXT: sub a0, a0, a5 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i128: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a4, 4(a1) +; RV32ZBB-NEXT: lw a6, 8(a1) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: sltu a2, a7, a6 +; RV32ZBB-NEXT: mv t4, a2 +; RV32ZBB-NEXT: beq t0, t1, .LBB38_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt t4, t1, t0 +; RV32ZBB-NEXT: .LBB38_2: +; RV32ZBB-NEXT: sltu t2, a5, a3 +; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: mv t3, t2 +; RV32ZBB-NEXT: beq a4, a1, .LBB38_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: mv t3, t5 +; RV32ZBB-NEXT: .LBB38_4: +; RV32ZBB-NEXT: addi sp, sp, -16 +; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBB-NEXT: xor t6, t0, t1 +; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: or t6, s0, t6 +; RV32ZBB-NEXT: beqz t6, .LBB38_6 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: mv t3, t4 +; RV32ZBB-NEXT: .LBB38_6: +; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: beq a1, a4, .LBB38_8 +; RV32ZBB-NEXT: # %bb.7: +; RV32ZBB-NEXT: mv t4, t5 +; RV32ZBB-NEXT: .LBB38_8: +; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: mv t6, t5 +; RV32ZBB-NEXT: beq a4, a1, .LBB38_10 +; RV32ZBB-NEXT: # %bb.9: +; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: .LBB38_10: +; RV32ZBB-NEXT: bnez t3, .LBB38_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sub t0, t1, t0 +; RV32ZBB-NEXT: sub a6, a7, a6 +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sltu a7, a6, t4 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a3, a5, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: j .LBB38_13 +; RV32ZBB-NEXT: .LBB38_12: +; RV32ZBB-NEXT: sltu a2, a6, a7 +; RV32ZBB-NEXT: sub t0, t0, t1 +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sub a6, a6, a7 +; RV32ZBB-NEXT: sltu a7, a6, t6 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a3, a3, a5 +; RV32ZBB-NEXT: sub a4, a4, a1 +; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: .LBB38_13: +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a1, 4(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBB-NEXT: addi sp, sp, 16 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i128: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sltu a4, a2, a0 +; RV64ZBB-NEXT: mv a5, a4 +; RV64ZBB-NEXT: beq a1, a3, .LBB38_2 +; RV64ZBB-NEXT: # %bb.1: +; RV64ZBB-NEXT: slt a5, a3, a1 +; RV64ZBB-NEXT: .LBB38_2: +; RV64ZBB-NEXT: bnez a5, .LBB38_4 +; RV64ZBB-NEXT: # %bb.3: +; RV64ZBB-NEXT: sub a1, a3, a1 +; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: ret +; RV64ZBB-NEXT: .LBB38_4: +; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sub a1, a1, a3 +; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp slt i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll index 14f45895754df..a9f933243f679 100644 --- a/llvm/test/CodeGen/RISCV/abdu.ll +++ b/llvm/test/CodeGen/RISCV/abdu.ll @@ -1720,6 +1720,398 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ret i128 %sel } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abdu(a,b) +; + +define 
i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; NOZBB-LABEL: abd_select_i8: +; NOZBB: # %bb.0: +; NOZBB-NEXT: andi a2, a1, 255 +; NOZBB-NEXT: andi a3, a0, 255 +; NOZBB-NEXT: bltu a3, a2, .LBB23_2 +; NOZBB-NEXT: # %bb.1: +; NOZBB-NEXT: sub a0, a0, a1 +; NOZBB-NEXT: ret +; NOZBB-NEXT: .LBB23_2: +; NOZBB-NEXT: sub a0, a1, a0 +; NOZBB-NEXT: ret +; +; ZBB-LABEL: abd_select_i8: +; ZBB: # %bb.0: +; ZBB-NEXT: andi a1, a1, 255 +; ZBB-NEXT: andi a0, a0, 255 +; ZBB-NEXT: minu a2, a0, a1 +; ZBB-NEXT: maxu a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp ult i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; RV32I-LABEL: abd_select_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a2, a2, -1 +; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: bgeu a2, a3, .LBB24_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB24_2: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a2, a2, -1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: and a2, a1, a2 +; RV64I-NEXT: bgeu a2, a3, .LBB24_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB24_2: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; +; ZBB-LABEL: abd_select_i16: +; ZBB: # %bb.0: +; ZBB-NEXT: zext.h a1, a1 +; ZBB-NEXT: zext.h a0, a0 +; ZBB-NEXT: minu a2, a0, a1 +; ZBB-NEXT: maxu a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp ule i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: abd_select_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: bltu a1, a0, .LBB25_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB25_2: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a2, a0 +; RV64I-NEXT: sext.w a3, a1 +; RV64I-NEXT: bltu a3, a2, .LBB25_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB25_2: +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: minu a2, a0, a1 +; RV32ZBB-NEXT: maxu a0, a0, a1 +; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: slli a1, a1, 32 +; RV64ZBB-NEXT: srli a1, a1, 32 +; RV64ZBB-NEXT: slli a0, a0, 32 +; RV64ZBB-NEXT: srli a0, a0, 32 +; RV64ZBB-NEXT: minu a2, a0, a1 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp ugt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: abd_select_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB26_3 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a3 +; RV32I-NEXT: bnez a4, .LBB26_4 +; RV32I-NEXT: .LBB26_2: +; RV32I-NEXT: mv a4, a1 +; RV32I-NEXT: mv a5, a0 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: j .LBB26_5 +; RV32I-NEXT: .LBB26_3: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: beqz a4, .LBB26_2 +; RV32I-NEXT: .LBB26_4: +; 
RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: mv a5, a2 +; RV32I-NEXT: .LBB26_5: +; RV32I-NEXT: sltu a2, a5, a0 +; RV32I-NEXT: sub a1, a4, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, a5, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: bgeu a0, a1, .LBB26_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB26_2: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sub a3, a1, a3 +; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sub a2, a0, a2 +; RV32ZBB-NEXT: beq a3, a1, .LBB26_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a0, a1, a3 +; RV32ZBB-NEXT: j .LBB26_3 +; RV32ZBB-NEXT: .LBB26_2: +; RV32ZBB-NEXT: sltu a0, a0, a2 +; RV32ZBB-NEXT: .LBB26_3: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: xor a2, a2, a1 +; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: xor a1, a3, a1 +; RV32ZBB-NEXT: add a1, a1, a0 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: minu a2, a0, a1 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp uge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; RV32I-LABEL: abd_select_i128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a7, 4(a2) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw t0, 12(a2) +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: beq a5, t0, .LBB27_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu t1, a5, t0 +; RV32I-NEXT: j .LBB27_3 +; RV32I-NEXT: .LBB27_2: +; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: .LBB27_3: +; RV32I-NEXT: lw t3, 0(a2) +; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: beq a3, a7, .LBB27_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: sltu a2, a3, a7 +; RV32I-NEXT: j .LBB27_6 +; RV32I-NEXT: .LBB27_5: +; RV32I-NEXT: sltu a2, a1, t3 +; RV32I-NEXT: .LBB27_6: +; RV32I-NEXT: xor t2, a5, t0 +; RV32I-NEXT: xor t4, a4, a6 +; RV32I-NEXT: or t2, t4, t2 +; RV32I-NEXT: beqz t2, .LBB27_8 +; RV32I-NEXT: # %bb.7: +; RV32I-NEXT: mv a2, t1 +; RV32I-NEXT: .LBB27_8: +; RV32I-NEXT: bnez a2, .LBB27_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a2, t3 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t2, a6 +; RV32I-NEXT: j .LBB27_11 +; RV32I-NEXT: .LBB27_10: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv t1, a3 +; RV32I-NEXT: mv t4, a5 +; RV32I-NEXT: mv t2, a4 +; RV32I-NEXT: mv a1, t3 +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: mv a4, a6 +; RV32I-NEXT: .LBB27_11: +; RV32I-NEXT: sltu a6, a4, t2 +; RV32I-NEXT: sub a7, a5, t4 +; RV32I-NEXT: sltu a5, a1, a2 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: mv a7, a5 +; RV32I-NEXT: beq a3, t1, .LBB27_13 +; RV32I-NEXT: # %bb.12: +; RV32I-NEXT: sltu a7, a3, t1 +; RV32I-NEXT: .LBB27_13: +; RV32I-NEXT: sub a4, a4, t2 +; RV32I-NEXT: sltu t0, a4, a7 +; RV32I-NEXT: sub a6, a6, t0 +; RV32I-NEXT: sub a4, a4, a7 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a6, 12(a0) +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i128: +; RV64I: # %bb.0: +; RV64I-NEXT: beq a1, a3, .LBB27_3 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sltu a4, a1, a3 +; 
RV64I-NEXT: beqz a4, .LBB27_4 +; RV64I-NEXT: .LBB27_2: +; RV64I-NEXT: mv a4, a1 +; RV64I-NEXT: mv a5, a0 +; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: j .LBB27_5 +; RV64I-NEXT: .LBB27_3: +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: bnez a4, .LBB27_2 +; RV64I-NEXT: .LBB27_4: +; RV64I-NEXT: mv a4, a3 +; RV64I-NEXT: mv a5, a2 +; RV64I-NEXT: .LBB27_5: +; RV64I-NEXT: sltu a2, a0, a5 +; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a5 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i128: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a6, 12(a1) +; RV32ZBB-NEXT: lw t0, 4(a2) +; RV32ZBB-NEXT: lw a1, 4(a1) +; RV32ZBB-NEXT: sltu a2, a4, a7 +; RV32ZBB-NEXT: sub t1, a6, t1 +; RV32ZBB-NEXT: sltu t2, a3, a5 +; RV32ZBB-NEXT: sub a2, t1, a2 +; RV32ZBB-NEXT: mv t1, t2 +; RV32ZBB-NEXT: beq a1, t0, .LBB27_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu t1, a1, t0 +; RV32ZBB-NEXT: .LBB27_2: +; RV32ZBB-NEXT: sub a7, a4, a7 +; RV32ZBB-NEXT: sltu t3, a7, t1 +; RV32ZBB-NEXT: sub a2, a2, t3 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: beq a2, a6, .LBB27_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: sltu t1, a6, a2 +; RV32ZBB-NEXT: j .LBB27_5 +; RV32ZBB-NEXT: .LBB27_4: +; RV32ZBB-NEXT: sltu t1, a4, a7 +; RV32ZBB-NEXT: .LBB27_5: +; RV32ZBB-NEXT: sub t0, a1, t0 +; RV32ZBB-NEXT: sub t0, t0, t2 +; RV32ZBB-NEXT: sub a5, a3, a5 +; RV32ZBB-NEXT: beq t0, a1, .LBB27_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: sltu a1, a1, t0 +; RV32ZBB-NEXT: j .LBB27_8 +; RV32ZBB-NEXT: .LBB27_7: +; RV32ZBB-NEXT: sltu a1, a3, a5 +; RV32ZBB-NEXT: .LBB27_8: +; RV32ZBB-NEXT: xor a3, a2, a6 +; RV32ZBB-NEXT: xor a4, a7, a4 +; RV32ZBB-NEXT: or a3, a4, a3 +; RV32ZBB-NEXT: beqz a3, .LBB27_10 +; RV32ZBB-NEXT: # %bb.9: +; RV32ZBB-NEXT: mv a1, t1 +; RV32ZBB-NEXT: .LBB27_10: +; RV32ZBB-NEXT: neg a6, a1 +; RV32ZBB-NEXT: xor a3, a7, a6 +; RV32ZBB-NEXT: sltu a4, a3, a6 +; RV32ZBB-NEXT: xor a2, a2, a6 +; RV32ZBB-NEXT: add a2, a2, a1 +; RV32ZBB-NEXT: sub a4, a2, a4 +; RV32ZBB-NEXT: xor a2, a5, a6 +; RV32ZBB-NEXT: sltu a5, a2, a6 +; RV32ZBB-NEXT: xor a7, t0, a6 +; RV32ZBB-NEXT: mv t1, a5 +; RV32ZBB-NEXT: beqz t0, .LBB27_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a7, a6 +; RV32ZBB-NEXT: .LBB27_12: +; RV32ZBB-NEXT: add a3, a3, a1 +; RV32ZBB-NEXT: sltu a6, a3, t1 +; RV32ZBB-NEXT: sub a4, a4, a6 +; RV32ZBB-NEXT: sub a3, a3, t1 +; RV32ZBB-NEXT: add a7, a7, a1 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a5, 4(a0) +; RV32ZBB-NEXT: sw a3, 8(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i128: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sub a3, a1, a3 +; RV64ZBB-NEXT: sub a3, a3, a4 +; RV64ZBB-NEXT: sub a2, a0, a2 +; RV64ZBB-NEXT: beq a3, a1, .LBB27_2 +; RV64ZBB-NEXT: # %bb.1: +; RV64ZBB-NEXT: sltu a0, a1, a3 +; RV64ZBB-NEXT: j .LBB27_3 +; RV64ZBB-NEXT: .LBB27_2: +; RV64ZBB-NEXT: sltu a0, a0, a2 +; RV64ZBB-NEXT: .LBB27_3: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: xor a2, a2, a1 +; RV64ZBB-NEXT: sltu a4, a2, a1 +; RV64ZBB-NEXT: xor a1, a3, a1 +; RV64ZBB-NEXT: add a1, a1, a0 +; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: add a0, a2, a0 +; RV64ZBB-NEXT: ret + %cmp = icmp ult i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub 
+} + declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) @@ -1737,4 +2129,3 @@ declare i32 @llvm.umin.i32(i32, i32) declare i64 @llvm.umin.i64(i64, i64) ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} -; NOZBB: {{.*}} diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index 9c4c059a3b9bf..4c524c28b160a 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -1154,6 +1154,211 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind { ret i32 %abs } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; X86-LABEL: abd_select_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: subb %dl, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i8: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpb %sil, %al +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovll %edi, %ecx +; X64-NEXT: cmovll %esi, %eax +; X64-NEXT: subb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %cmp = icmp slt i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; X86-LABEL: abd_select_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovlel %eax, %edx +; X86-NEXT: cmovlel %ecx, %eax +; X86-NEXT: subl %edx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpw %si, %ax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovlel %edi, %ecx +; X64-NEXT: cmovlel %esi, %eax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %cmp = icmp sle i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; X86-LABEL: abd_select_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmovgl %edx, %eax +; X86-NEXT: cmovgl %ecx, %edx +; X86-NEXT: subl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i32: +; X64: # %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: cmovgl %esi, %edi +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq + %cmp = icmp sgt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; X86-LABEL: abd_select_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl %edi, %edx +; 
X86-NEXT: cmovgel %ecx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmovgel %ebx, %eax +; X86-NEXT: cmovgel %edi, %ecx +; X86-NEXT: cmovgel %esi, %ebx +; X86-NEXT: subl %ebx, %eax +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i64: +; X64: # %bb.0: +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmovgeq %rdi, %rax +; X64-NEXT: cmovgeq %rsi, %rdi +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq + %cmp = icmp sge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; X86-LABEL: abd_select_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovll %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: cmovll %ebx, %edi +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: cmovll %ecx, %ebx +; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: cmovll %esi, %ebp +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmovll %edx, %eax +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; X64-LABEL: abd_select_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: sbbq %rcx, %rdi +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: cmovlq %rsi, %rdi +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: cmovlq %rax, %r8 +; X64-NEXT: cmovlq %rcx, %rsi +; X64-NEXT: cmovlq %rdx, %rax +; X64-NEXT: subq %r8, %rax +; X64-NEXT: sbbq %rdi, %rsi +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: retq + %cmp = icmp slt i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index 335fa8c156f8e..fe9006a8aec23 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -768,6 +768,212 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ret i128 %sel } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abdu(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; X86-LABEL: abd_select_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: movl 
%ecx, %edx +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: subb %dl, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i8: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpb %sil, %al +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovbl %edi, %ecx +; X64-NEXT: cmovbl %esi, %eax +; X64-NEXT: subb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %cmp = icmp ult i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; X86-LABEL: abd_select_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovbel %eax, %edx +; X86-NEXT: cmovbel %ecx, %eax +; X86-NEXT: subl %edx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpw %si, %ax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovbel %edi, %ecx +; X64-NEXT: cmovbel %esi, %eax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %cmp = icmp ule i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; X86-LABEL: abd_select_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmoval %edx, %eax +; X86-NEXT: cmoval %ecx, %edx +; X86-NEXT: subl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i32: +; X64: # %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: cmoval %esi, %edi +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq + %cmp = icmp ugt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; X86-LABEL: abd_select_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: cmovael %ecx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmovael %ebx, %eax +; X86-NEXT: cmovael %edi, %ecx +; X86-NEXT: cmovael %esi, %ebx +; X86-NEXT: subl %ebx, %eax +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i64: +; X64: # %bb.0: +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmovaeq %rdi, %rax +; X64-NEXT: cmovaeq %rsi, %rdi +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq + %cmp = icmp uge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; X86-LABEL: abd_select_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: 
movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: cmovbl %ebx, %edi +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: cmovbl %ecx, %ebx +; X86-NEXT: cmovbl %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: cmovbl %esi, %ebp +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmovbl %edx, %eax +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; X64-LABEL: abd_select_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: sbbq %rcx, %rdi +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: cmovbq %rsi, %rdi +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: cmovbq %rax, %r8 +; X64-NEXT: cmovbq %rcx, %rsi +; X64-NEXT: cmovbq %rdx, %rax +; X64-NEXT: subq %r8, %rax +; X64-NEXT: sbbq %rdi, %rsi +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: retq + %cmp = icmp ult i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} + declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) From ee61a4db3c6d21fcbdccd74606e3c050052b8e7c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 11 Sep 2024 17:19:11 +0400 Subject: [PATCH 111/114] AMDGPU: Add tests for minimumnum/maximumnum intrinsics Vector cases are broken, so leave those for later. 
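
For reference, every scalar test in the new files follows the shape
below (a minimal sketch, not one of the checked functions; the actual
FileCheck lines are generated by update_llc_test_checks.py):

  define float @sketch_maximumnum_f32(float %x, float %y) {
    ; maximumnum returns the larger operand; if exactly one operand is
    ; a NaN, the other operand is returned. Without the nnan flag the
    ; backend first canonicalizes each input (a self-max such as
    ; v_max_f32 v0, v0, v0 on gfx9+, a multiply by 1.0 on gfx8) to
    ; quiet signaling NaNs, after which a single max of the two
    ; canonicalized values suffices.
    %result = call float @llvm.maximumnum.f32(float %x, float %y)
    ret float %result
  }
  declare float @llvm.maximumnum.f32(float, float)

The same shape is repeated for f16, bf16, f64 and for minimumnum, with
and without the nnan flag, and with fneg/fabs and inreg (SGPR operand)
variants.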
--- llvm/test/CodeGen/AMDGPU/maximumnum.ll | 1736 ++++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/minimumnum.ll | 1690 +++++++++++++++++++++++ 2 files changed, 3426 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/maximumnum.ll create mode 100644 llvm/test/CodeGen/AMDGPU/minimumnum.ll diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll new file mode 100644 index 0000000000000..506f40516c9e6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -0,0 +1,1736 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define half @v_maximumnum_f16(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_nnan(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_1.0(half %x) { +; GFX8-LABEL: v_maximumnum_f16_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half 1.0) + ret half %result +} + +define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_maximumnum_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_maximumnum_bf16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: 
v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_bf16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_bf16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_bf16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_bf16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define float @v_maximumnum_f32(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_maximumnum_f32_nnan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_maximumnum_f64(double %x, double %y) { +; GFX8-LABEL: v_maximumnum_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: 
v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_maximumnum_f64_nnan(double %x, double %y) { +; GFX8-LABEL: v_maximumnum_f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_maximumnum_f32_1.0(float %x) { +; GFX8-LABEL: v_maximumnum_f32_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float 1.0) + ret float %result +} + +define float @v_maximumnum_f32_rhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %x, float %canon.y) + ret float %result +} + +define float @v_maximumnum_f32_lhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 
v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %result = call float @llvm.maximumnum.f32(float %canon.x, float %y) + ret float %result +} + +define float @v_maximumnum_f32_both_operands_not_snan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %canon.x, float %canon.y) + ret float %result +} + +define double @v_maximumnum_f64_1.0(double %x) { +; GFX8-LABEL: v_maximumnum_f64_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double 1.0) + ret double %result +} + +define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { +; GFX8-LABEL: v_maximumnum_f16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: 
v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) { +; GFX8-LABEL: v_maximumnum_f16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define float @v_maximumnum_f32_s_v(float inreg %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float 
%y) + ret float %result +} + +define float @v_maximumnum_f32_v_s(float %x, float inreg %y) { +; GFX8-LABEL: v_maximumnum_f32_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_maximumnum_f32_s_s(float inreg %x, float inreg %y) { +; GFX8-LABEL: v_maximumnum_f32_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_maximumnum_f64_s_v(double inreg %x, double %y) { +; GFX8-LABEL: 
v_maximumnum_f64_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_maximumnum_f64_v_s(double %x, double inreg %y) { +; GFX8-LABEL: v_maximumnum_f64_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call 
double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_maximumnum_f64_s_s(double inreg %x, double inreg %y) { +; GFX8-LABEL: v_maximumnum_f64_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_maximumnum_f32_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: 
v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %x, float %fabs.y) + ret float %result +} + +define float @v_maximumnum_f32_fneg_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, -1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %fneg.fabs.y = fneg float %fabs.y + %result = call float @llvm.maximumnum.f32(float %x, float %fneg.fabs.y) + ret float %result +} + +define float @v_maximumnum_f32_fabs(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %fabs.x, float %fabs.y) + ret float %result +} + +define float @v_maximumnum_f32_fneg(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, -1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f32_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg float %x + %fneg.y = fneg float %y + %result = call float @llvm.maximumnum.f32(float %fneg.x, float %fneg.y) + ret float %result +} + +define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: 
s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.maximumnum.f16(half %x, half %fabs.y) + ret half %result +} + +define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %fneg.fabs.y = fneg half %fabs.y + %result = call half @llvm.maximumnum.f16(half %x, half %fneg.fabs.y) + ret half %result +} + +define half @v_maximumnum_f16_fabs(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 
s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call half @llvm.fabs.f16(half %x) + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.maximumnum.f16(half %fabs.x, half %fabs.y) + ret half %result +} + +define half @v_maximumnum_f16_fneg(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX8-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f16_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg half %x + %fneg.y = fneg half %y + %result = call half @llvm.maximumnum.f16(half %fneg.x, half %fneg.y) + ret half %result +} + +define double @v_maximumnum_f64_fneg(double %x, double %y) { +; GFX8-LABEL: v_maximumnum_f64_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] 
+; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], -v[2:3], -v[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg double %x + %fneg.y = fneg double %y + %result = call double @llvm.maximumnum.f64(double %fneg.x, double %fneg.y) + ret double %result +} diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll new file mode 100644 index 0000000000000..a2ba770067d16 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -0,0 +1,1690 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define half @v_minimumnum_f16(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_minimumnum_f16_nnan(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
v_minimumnum_f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_minimumnum_f16_1.0(half %x) { +; GFX8-LABEL: v_minimumnum_f16_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half 1.0) + ret half %result +} + +define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_minimumnum_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_cmp_eq_u16_e32 
vcc, s4, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: 
v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call bfloat 
@llvm.minimumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_minimumnum_bf16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_bf16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_bf16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_bf16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_bf16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 
v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define float @v_minimumnum_f32(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_nnan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_minimumnum_f64(double %x, 
double %y) { +; GFX8-LABEL: v_minimumnum_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_minimumnum_f64_nnan(double %x, double %y) { +; GFX8-LABEL: v_minimumnum_f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_minimumnum_f32_1.0(float %x) { +; GFX8-LABEL: v_minimumnum_f32_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX10-LABEL: v_minimumnum_f32_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float 1.0) + ret float %result +} + +define float @v_minimumnum_f32_rhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %x, float %canon.y) + ret float %result +} + +define float @v_minimumnum_f32_lhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %result = call float @llvm.minimumnum.f32(float %canon.x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_both_operands_not_snan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %canon.x, float %canon.y) + ret float %result +} + +define double @v_minimumnum_f64_1.0(double %x) { +; GFX8-LABEL: v_minimumnum_f64_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: 
v_minimumnum_f64_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double 1.0) + ret double %result +} + +define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { +; GFX8-LABEL: v_minimumnum_f16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) { +; GFX8-LABEL: v_minimumnum_f16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: 
v_min_f16_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define float @v_minimumnum_f32_s_v(float inreg %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_v_s(float %x, float inreg %y) { +; GFX8-LABEL: v_minimumnum_f32_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_v_s: +; GFX11: ; %bb.0: 
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_s_s(float inreg %x, float inreg %y) { +; GFX8-LABEL: v_minimumnum_f32_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_minimumnum_f64_s_v(double inreg %x, double %y) { +; GFX8-LABEL: v_minimumnum_f64_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_minimumnum_f64_v_s(double %x, double inreg %y) { +; GFX8-LABEL: v_minimumnum_f64_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_minimumnum_f64_s_s(double inreg %x, double inreg %y) { +; GFX8-LABEL: v_minimumnum_f64_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; 
GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_minimumnum_f32_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %x, float %fabs.y) + ret float %result +} + +define float @v_minimumnum_f32_fneg_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, -1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %fneg.fabs.y = fneg float %fabs.y + %result = call float @llvm.minimumnum.f32(float %x, float %fneg.fabs.y) + ret float %result +} + +define float @v_minimumnum_f32_fabs(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %fabs.x, float %fabs.y) + ret float %result +} + +define float @v_minimumnum_f32_fneg(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, -1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX9-NEXT: 
v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f32_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg float %x + %fneg.y = fneg float %y + %result = call float @llvm.minimumnum.f32(float %fneg.x, float %fneg.y) + ret float %result +} + +define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.minimumnum.f16(half %x, half %fabs.y) + ret half %result +} + +define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_max_f16_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %fneg.fabs.y = fneg half %fabs.y + %result = call half @llvm.minimumnum.f16(half %x, half %fneg.fabs.y) + ret half %result +} + +define half @v_minimumnum_f16_fabs(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call half @llvm.fabs.f16(half %x) + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.minimumnum.f16(half %fabs.x, half %fabs.y) + ret half %result +} + +define half @v_minimumnum_f16_fneg(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX8-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; 
GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f16_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg half %x + %fneg.y = fneg half %y + %result = call half @llvm.minimumnum.f16(half %fneg.x, half %fneg.y) + ret half %result +} + +define double @v_minimumnum_f64_fneg(double %x, double %y) { +; GFX8-LABEL: v_minimumnum_f64_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], -v[2:3], -v[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg double %x + %fneg.y = fneg double %y + %result = call double @llvm.minimumnum.f64(double %fneg.x, double %fneg.y) + ret double %result +} From 1741b9c3d75ee34550210fadb9c6156419c3c892 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 11 Sep 2024 
15:21:32 +0100 Subject: [PATCH 112/114] [LV] Generalize check lines for interleave group costs. Check the cost of all instructions in an interleave group, to prepare for follow-up changes. --- .../X86/interleaved-load-f32-stride-5.ll | 107 +++++++- .../X86/interleaved-load-f32-stride-7.ll | 150 +++++++++- .../X86/interleaved-load-f32-stride-8.ll | 175 +++++++++++- .../X86/interleaved-load-f64-stride-2.ll | 34 ++- .../X86/interleaved-load-f64-stride-3.ll | 56 +++- .../X86/interleaved-load-f64-stride-4.ll | 78 +++++- .../X86/interleaved-load-f64-stride-5.ll | 90 +++++- .../X86/interleaved-load-f64-stride-6.ll | 111 +++++++- .../X86/interleaved-load-f64-stride-7.ll | 132 ++++++++- .../X86/interleaved-load-f64-stride-8.ll | 146 +++++++++- .../X86/interleaved-load-i16-stride-5.ll | 151 +++++++++- .../X86/interleaved-load-i16-stride-7.ll | 223 ++++++++++++++- .../X86/interleaved-load-i16-stride-8.ll | 259 +++++++++++++++++- ...erleaved-load-i32-stride-4-indices-01uu.ll | 34 ++- .../X86/interleaved-load-i32-stride-5.ll | 107 +++++++- .../X86/interleaved-load-i32-stride-7.ll | 150 +++++++++- .../X86/interleaved-load-i32-stride-8.ll | 175 +++++++++++- .../X86/interleaved-load-i64-stride-2.ll | 34 ++- .../X86/interleaved-load-i64-stride-3.ll | 56 +++- .../X86/interleaved-load-i64-stride-4.ll | 78 +++++- .../X86/interleaved-load-i64-stride-5.ll | 90 +++++- .../X86/interleaved-load-i64-stride-6.ll | 111 +++++++- .../X86/interleaved-load-i64-stride-7.ll | 132 ++++++++- .../X86/interleaved-load-i64-stride-8.ll | 146 +++++++++- .../X86/interleaved-load-i8-stride-5.ll | 151 +++++++++- .../X86/interleaved-load-i8-stride-6.ll | 187 ++++++++++++- .../X86/interleaved-load-i8-stride-7.ll | 223 ++++++++++++++- .../X86/interleaved-load-i8-stride-8.ll | 259 +++++++++++++++++- .../X86/interleaved-store-f32-stride-8.ll | 4 + .../X86/interleaved-store-f64-stride-8.ll | 147 +++++++++- .../X86/interleaved-store-i64-stride-8.ll | 147 +++++++++- .../X86/masked-interleaved-store-i16.ll | 10 + 32 files changed, 3923 insertions(+), 30 deletions(-) diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll index de178cdf19308..29dce5f21173a 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,31 +14,136 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; SSE2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; SSE2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; SSE2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 
For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 75 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 150 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 
for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 75 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 150 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; 
AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 145 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll index 1f54b7485aa8f..0e7b1c58e587c 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py 
UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,31 +14,179 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; SSE2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For 
instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; SSE2: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 23 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 
For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 105 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost 
of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 105 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; 
AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 
For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll index d53dca05155b7..8830aff579c32 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,30 +14,203 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 
For instruction: %v5 = load float, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 120 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 40 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 92 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
index 1575f92465d52..cfd3d7841caa2 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,35 +14,67 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 112 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
index 89175a65990f6..5ec5b51731385 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,32 +14,86 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; SSE2: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; SSE2: LV: Found an estimated cost of 36 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 84 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
index 8db9fd364133e..450743df72325 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,107 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
index 25c49e3b8a811..5e5c718dba97d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,116 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
index 42c980b6d3985..62541fa2368c6 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,137 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 36 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 84 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
index 68afa6d17f02f..cfed8554b978b 100644
---
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,158 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
index 7894912c88fab..07939b914d022 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,27 +14,171 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
 ; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX512: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
index d8eaa0aad61d5..964a9b660942e 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,193 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
 ; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
 ; SSE2: LV: Found an estimated cost of 43 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; SSE2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX1: LV: Found an estimated cost of 25 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX1: LV: Found an estimated cost of 43 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX1: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX1: LV: Found an estimated cost of 175 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX1: LV: Found an estimated cost of 350 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX2: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX2: LV: Found an estimated cost of 330 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
 ;
 ; AVX512DQ-LABEL: 'test'
 ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 25 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 175 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 355 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 710 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
 ;
 ; AVX512BW-LABEL: 'test'
 ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 55 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 235 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
index 9c0d102a70d1e..6653198397dd2 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -15,44 +15,265 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; SSE2: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 ; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 ; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an 
estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; SSE2: LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 62 for VF 4 For instruction: %v0 = load 
i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 245 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 490 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an 
estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX2: LV: Found an estimated cost of 231 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 
2 ; AVX2: LV: Found an estimated cost of 462 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 
for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 121 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 245 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 497 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 994 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for 
VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 15 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 19 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 
2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 112 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 469 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll index 7654185635d3e..b3a5cbeccc09c 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -15,44 +15,301 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 ; SSE2: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 ; SSE2: LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an 
estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 ; SSE2: LV: Found an estimated cost of 136 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 ; SSE2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = 
load i16, ptr %in7, align 2 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX1: LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX1: LV: Found an estimated cost of 136 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX1: LV: Found an estimated cost of 280 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX1: LV: Found an estimated cost of 560 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for 
VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; 
AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX2: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX2: LV: Found an estimated cost of 528 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: 
Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX512DQ: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX512DQ: LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX512DQ: LV: Found an estimated cost of 136 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX512DQ: LV: Found an estimated cost of 280 for VF 16 For 
instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512DQ: LV: Found an estimated cost of 568 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512DQ: LV: Found an estimated cost of 1136 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i16, ptr %in7, align 2
 ;
 ; AVX512BW-LABEL: 'test'
 ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 64 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 148 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 616 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i16, ptr %in7, align 2
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
index 86590c0fa6a9c..c0ea210385dfd 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,35 +14,67 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
 ; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
 ; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; SSE2: LV: Found an estimated cost of 60 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; SSE2: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX1: LV: Found an estimated cost of 10 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX1: LV: Found an estimated cost of 20 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX1: LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX1: LV: Found an estimated cost of 84 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX1: LV: Found an estimated cost of 168 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX2: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 13 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
index 63901617bb9dd..2a261ca4de4fa 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,136 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
 ; SSE2: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; SSE2: LV: Found an estimated cost of 75 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; SSE2: LV: Found an estimated cost of 150 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX1: LV: Found an estimated cost of 23 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX1: LV: Found an estimated cost of 95 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX1: LV: Found an estimated cost of 190 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512: LV: Found an estimated cost of 145 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
index 1eabac4e0b9c3..8bf3071d29fbe 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,179 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
 ; SSE2: LV: Found an estimated cost of 49 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
 ; SSE2: LV: Found an estimated cost of 105 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
 ; SSE2: LV: Found an estimated cost of 210 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX1: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX1: LV: Found an estimated cost of 133 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX1: LV: Found an estimated cost of 266 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX2: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX2: LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX512: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
index a1bb2efd73963..3182de2df058a 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,30 +14,203 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 120 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 240 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 36 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 152 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 304 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 40 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 92 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
index bd230166ebe78..27e2ee0392615 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,35 +14,67 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 22 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX1: LV: Found an
estimated cost of 44 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 88 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 176 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an 
estimated cost of 160 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll index e03d3c2f8b3a4..c37723257c1f7 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,32 +14,86 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; SSE2: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; SSE2: LV: Found an estimated cost of 84 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For 
instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 33 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 66 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 132 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, 
ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll index f7249666918dd..2eb7c5e93078f 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in." 
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,107 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 44 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 88 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 176 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
index 96946bd58dea1..c11da4309737d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,116 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 70 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 25 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 55 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
index 2355c6e8b57a1..de57af6ebe398 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,137 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 42 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 84 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 66 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 132 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
index 646003a41dcf5..949c1af1fdad3 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,158 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 49 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 98 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 77 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 154 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV:
Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX2: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX2: LV: Found an estimated cost of 126 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = 
load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: 
%v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll index 568ab74068f94..4388ccfbdcfc4 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,27 +14,171 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
; SSE2: LV: Found an estimated cost of 112 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX1: LV: Found an estimated cost of 40 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX1: LV: Found an estimated cost of 88 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX1: LV: Found an estimated cost of 176 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX2: LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX2: LV: Found an estimated cost of 144 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
index 6c1dd916311ab..6078fb440f9d1 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,193 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; SSE2: LV: Found an estimated cost of 38 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
; SSE2: LV: Found an estimated cost of 75 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
; SSE2: LV: Found an estimated cost of 155 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
; SSE2: LV: Found an estimated cost of 315 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX1: LV: Found an estimated cost of 83 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX1: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX1: LV: Found an estimated cost of 335 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 325 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 335 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 675 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX512BW-LABEL: 'test'
; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 99 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 198 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 395 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
index 1ff3bc57a50d9..ed8bc84e771f8 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,229 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
; SSE2: LV: Found an estimated cost of 47 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
; SSE2: LV: Found an estimated cost of 90 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
; SSE2: LV: Found an estimated cost of 186 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
; SSE2: LV: Found an estimated cost of 378 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX1: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for
VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX1: LV: Found an estimated cost of 198 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX1: LV: Found an estimated cost of 402 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an 
estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 46 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 88 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 
2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 45 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 85 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 810 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: 
Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 25 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 49 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 119 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 
= load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 237 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 591 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll index d77bca6b7aa5a..778a4e7dfd7d9 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -15,44 +15,265 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; SSE2: LV: Found an estimated cost of 57 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 ; SSE2: LV: Found an estimated cost of 110 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 ; SSE2: LV: Found an estimated cost of 217 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For 
instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 ; SSE2: LV: Found an estimated cost of 441 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX1: LV: Found an estimated cost of 62 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 
for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX1: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX1: LV: Found an estimated cost of 231 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX1: LV: Found an estimated cost of 469 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an 
estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 224 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 455 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an 
estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512DQ: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512DQ: LV: Found an estimated cost of 62 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 
1 ; AVX512DQ: LV: Found an estimated cost of 120 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512DQ: LV: Found an estimated cost of 233 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512DQ: LV: Found an estimated cost of 469 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512DQ: LV: Found an estimated cost of 945 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated 
cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 29 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 57 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 138 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: 
Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 413 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 826 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll index 00ad2f68814b8..a230b5a0b1f2b 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -15,44 +15,301 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1 ; SSE2: LV: Found an estimated cost of 120 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For 
instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
; SSE2: LV: Found an estimated cost of 248 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
; SSE2: LV: Found an estimated cost of 504 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX1: LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX1: LV: Found an estimated cost of 66 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX1: LV: Found an estimated cost of 132 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX1: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX1: LV: Found an estimated cost of 536 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX2: LV: Found an estimated cost of 256 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX2: LV: Found an estimated cost of 520 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 66 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 132 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 536 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 1080 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX512BW-LABEL: 'test'
; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 33 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 65 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 158 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 472 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 1100 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i8, ptr %in7, align 1
;
entry:
  br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
index 678bb8917bd0d..2ad37bee35bed 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
@@ -14,12 +14,14 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v7, ptr %out7, align 4
; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: store float %v7, ptr %out7, align 4
; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: store float %v7, ptr %out7, align 4
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v7, ptr %out7, align 4
; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: store float %v7, ptr %out7, align 4
; AVX1: LV: Found an estimated cost of 120 for VF 8 For instruction: store float %v7, ptr %out7, align 4
@@ -27,6 +29,7 @@ define void @test() {
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v7, ptr %out7, align 4
; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: store float %v7, ptr %out7, align 4
; AVX2: LV: Found an estimated cost of 120 for VF 8 For instruction: store float %v7, ptr %out7, align 4
@@ -34,6 +37,7 @@ define void @test() {
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: store float %v7, ptr %out7, align 4
; AVX512: LV: Found an estimated cost of 23 for VF 4 For instruction: store float %v7, ptr %out7, align 4
; AVX512: LV: Found an estimated cost of 46 for VF 8 For instruction: store float %v7, ptr %out7, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
index 394d1d4de00f5..c1a66c1a41d74 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v7, ptr %out7"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v., ptr %out."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -13,27 +13,172 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8
; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8
; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: store double %v7, ptr %out7, align 8
;
; AVX1-LABEL: 'test'
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8
; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8
; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: store double %v7, ptr %out7, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8
; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: store double %v7, ptr %out7, align 8
;
; AVX2-LABEL: 'test'
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8
; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8
; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: store double %v7, ptr %out7, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8
; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: store double %v7, ptr %out7, align 8
;
; AVX512-LABEL: 'test'
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 23 for VF 2 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store double %v7, ptr %out7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll index 4d9aad54b0c8f..7be9577960efe 100644 --- 
a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v7, ptr %out7" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v., ptr %out." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -13,27 +13,172 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: store i64 %v7, 
ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 112 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 ; ; AVX1-LABEL: 'test' +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 40 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 
for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 88 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 176 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 ; ; AVX2-LABEL: 'test' +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; 
AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 40 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 88 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 176 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 ; ; AVX512-LABEL: 'test' +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated 
cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 23 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll 
diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
index 741dd0746b744..13a844230f89d 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
@@ -22,6 +22,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; DISABLED_MASKED_STRIDED-LABEL: 'test1'
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 13 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -34,6 +36,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; ENABLED_MASKED_STRIDED-LABEL: 'test1'
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 12 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -79,6 +83,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no
 ; DISABLED_MASKED_STRIDED-LABEL: 'test2'
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -91,6 +97,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no
 ; ENABLED_MASKED_STRIDED-LABEL: 'test2'
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -145,6 +153,7 @@ for.end:
 define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly %x, ptr noalias nocapture readnone %y) {
 ; DISABLED_MASKED_STRIDED-LABEL: 'test'
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2
@@ -152,6 +161,7 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ;
 ; ENABLED_MASKED_STRIDED-LABEL: 'test'
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2

From 9a9f155df1ed13fdc690d713242b13508f6d725e Mon Sep 17 00:00:00 2001
From: Tyler Nowicki
Date: Wed, 11 Sep 2024 10:29:06 -0400
Subject: [PATCH 113/114] [Coroutines] Split buildCoroutineFrame into normalization and frame building (#108076)

* Split buildCoroutineFrame into code related to normalization and code
  related to actually building the coroutine frame.
* This will enable future specialization of buildCoroutineFrame for
  different ABIs while the normalization can be done by splitCoroutine
  prior to calling buildCoroutineFrame.
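In practice this means splitCoroutine drives the two phases back to back. A
minimal sketch of the resulting call sequence, using only names introduced in
this patch (the wrapper function below is hypothetical, and it assumes the
internal CoroInternal.h declarations; it mirrors the CoroSplit.cpp call site
shown in the diff rather than replacing it):

    // Sketch only: Shape and TTI are set up by the coroutine splitting pass.
    static void lowerCoroutineFrame(llvm::Function &F, llvm::coro::Shape &Shape,
                                    llvm::TargetTransformInfo &TTI) {
      // Phase 1: ABI-independent rewriting (swifterror elimination, splitting
      // around suspends, PHI rewriting).
      llvm::coro::normalizeCoroutine(F, Shape, TTI);
      // Phase 2: frame building proper; rematerialization decisions come from
      // the materializable-instruction callback.
      llvm::coro::buildCoroutineFrame(F, Shape, llvm::coro::defaultMaterializable);
    }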
See RFC for more info:
https://discourse.llvm.org/t/rfc-abi-objects-for-coroutines/81057
---
 llvm/lib/Transforms/Coroutines/CoroFrame.cpp  | 14 ++++++++------
 llvm/lib/Transforms/Coroutines/CoroInternal.h |  4 +++-
 llvm/lib/Transforms/Coroutines/CoroSplit.cpp  |  3 ++-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 4b76fc7936100..8ee4bfa3b888d 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -1754,7 +1754,8 @@ static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB,
   if (depth == 0)
     return false;
   // If this is a suspend block, we're about to exit the resumption function.
-  if (isSuspendBlock(BB)) return true;
+  if (isSuspendBlock(BB))
+    return true;
 
   // Recurse into the successors.
   for (auto *Succ : successors(BB)) {
@@ -2288,9 +2289,8 @@ static void doRematerializations(
   rewriteMaterializableInstructions(AllRemats);
 }
 
-void coro::buildCoroutineFrame(
-    Function &F, Shape &Shape, TargetTransformInfo &TTI,
-    const std::function<bool(Instruction &)> &MaterializableCallback) {
+void coro::normalizeCoroutine(Function &F, coro::Shape &Shape,
+                              TargetTransformInfo &TTI) {
   // Don't eliminate swifterror in async functions that won't be split.
   if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
     eliminateSwiftError(F, Shape);
@@ -2337,10 +2337,12 @@ void coro::buildCoroutineFrame(
   // Transforms multi-edge PHI Nodes, so that any value feeding into a PHI will
   // never have its definition separated from the PHI by the suspend point.
   rewritePHIs(F);
+}
 
-  // Build suspend crossing info.
+void coro::buildCoroutineFrame(
+    Function &F, Shape &Shape,
+    const std::function<bool(Instruction &)> &MaterializableCallback) {
   SuspendCrossingInfo Checker(F, Shape.CoroSuspends, Shape.CoroEnds);
-
   doRematerializations(F, Checker, MaterializableCallback);
 
   const DominatorTree DT(F);
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index be86f96525b67..698c21a797420 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -281,8 +281,10 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
 };
 
 bool defaultMaterializable(Instruction &V);
+void normalizeCoroutine(Function &F, coro::Shape &Shape,
+                        TargetTransformInfo &TTI);
 void buildCoroutineFrame(
-    Function &F, Shape &Shape, TargetTransformInfo &TTI,
+    Function &F, Shape &Shape,
     const std::function<bool(Instruction &)> &MaterializableCallback);
 CallInst *createMustTailCall(DebugLoc Loc, Function *MustTailCallFn,
                              TargetTransformInfo &TTI,
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 494c4d632de95..dc3829d7f28eb 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -2030,7 +2030,8 @@ splitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones,
   lowerAwaitSuspends(F, Shape);
 
   simplifySuspendPoints(Shape);
-  buildCoroutineFrame(F, Shape, TTI, MaterializableCallback);
+  normalizeCoroutine(F, Shape, TTI);
+  buildCoroutineFrame(F, Shape, MaterializableCallback);
   replaceFrameSizeAndAlignment(Shape);
 
   bool isNoSuspendCoroutine = Shape.CoroSuspends.empty();

From 193e81e9612746fd58c43e53438257ecdc2cb415 Mon Sep 17 00:00:00 2001
From: tnowicki
Date: Fri, 23 Aug 2024 18:24:54 -0400
Subject: [PATCH 114/114] [Coroutines] Move materialization code into its own utils

* Move materialization out of CoroFrame to MaterializationUtils.h
* Move spill related utilities that were used by materialization to
  SpillUtils
* Move isSuspendBlock (needed by materialization) to CoroInternal
---
 llvm/lib/Transforms/Coroutines/CMakeLists.txt |   1 +
 llvm/lib/Transforms/Coroutines/CoroFrame.cpp  | 296 +----------------
 llvm/lib/Transforms/Coroutines/CoroInternal.h |   1 +
 llvm/lib/Transforms/Coroutines/Coroutines.cpp |   4 +
 .../Coroutines/MaterializationUtils.cpp       | 308 ++++++++++++++++++
 .../Coroutines/MaterializationUtils.h         |  30 ++
 llvm/lib/Transforms/Coroutines/SpillUtils.cpp |  24 +-
 llvm/lib/Transforms/Coroutines/SpillUtils.h   |   2 -
 8 files changed, 360 insertions(+), 306 deletions(-)
 create mode 100644 llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
 create mode 100644 llvm/lib/Transforms/Coroutines/MaterializationUtils.h

diff --git a/llvm/lib/Transforms/Coroutines/CMakeLists.txt b/llvm/lib/Transforms/Coroutines/CMakeLists.txt
index c6508174a7f10..46ef5cd4e8cfe 100644
--- a/llvm/lib/Transforms/Coroutines/CMakeLists.txt
+++ b/llvm/lib/Transforms/Coroutines/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_component_library(LLVMCoroutines
   CoroSplit.cpp
   SuspendCrossingInfo.cpp
   SpillUtils.cpp
+  MaterializationUtils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Coroutines
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 8ee4bfa3b888d..b74c9f01cd239 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -16,10 +16,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "CoroInternal.h"
+#include "MaterializationUtils.h"
 #include "SpillUtils.h"
 #include "SuspendCrossingInfo.h"
 #include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Analysis/StackLifetime.h"
@@ -36,135 +36,12 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
-#include <deque>
 #include <optional>
 
 using namespace llvm;
 
 extern cl::opt<bool> UseNewDbgInfoFormat;
 
-// The "coro-suspend-crossing" flag is very noisy. There is another debug type,
-// "coro-frame", which results in leaner debug spew.
-#define DEBUG_TYPE "coro-suspend-crossing"
-
-namespace {
-
-// RematGraph is used to construct a DAG for rematerializable instructions
-// When the constructor is invoked with a candidate instruction (which is
-// materializable) it builds a DAG of materializable instructions from that
-// point.
-// Typically, for each instruction identified as re-materializable across a
-// suspend point, a RematGraph will be created.
-struct RematGraph {
-  // Each RematNode in the graph contains the edges to instructions providing
-  // operands in the current node.
-  struct RematNode {
-    Instruction *Node;
-    SmallVector<RematNode *> Operands;
-    RematNode() = default;
-    RematNode(Instruction *V) : Node(V) {}
-  };
-
-  RematNode *EntryNode;
-  using RematNodeMap =
-      SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
-  RematNodeMap Remats;
-  const std::function<bool(Instruction &)> &MaterializableCallback;
-  SuspendCrossingInfo &Checker;
-
-  RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
-             Instruction *I, SuspendCrossingInfo &Checker)
-      : MaterializableCallback(MaterializableCallback), Checker(Checker) {
-    std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
-    EntryNode = FirstNode.get();
-    std::deque<std::unique_ptr<RematNode>> WorkList;
-    addNode(std::move(FirstNode), WorkList, cast<User>(I));
-    while (WorkList.size()) {
-      std::unique_ptr<RematNode> N = std::move(WorkList.front());
-      WorkList.pop_front();
-      addNode(std::move(N), WorkList, cast<User>(I));
-    }
-  }
-
-  void addNode(std::unique_ptr<RematNode> NUPtr,
-               std::deque<std::unique_ptr<RematNode>> &WorkList,
-               User *FirstUse) {
-    RematNode *N = NUPtr.get();
-    if (Remats.count(N->Node))
-      return;
-
-    // We haven't see this node yet - add to the list
-    Remats[N->Node] = std::move(NUPtr);
-    for (auto &Def : N->Node->operands()) {
-      Instruction *D = dyn_cast<Instruction>(Def.get());
-      if (!D || !MaterializableCallback(*D) ||
-          !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
-        continue;
-
-      if (Remats.count(D)) {
-        // Already have this in the graph
-        N->Operands.push_back(Remats[D].get());
-        continue;
-      }
-
-      bool NoMatch = true;
-      for (auto &I : WorkList) {
-        if (I->Node == D) {
-          NoMatch = false;
-          N->Operands.push_back(I.get());
-          break;
-        }
-      }
-      if (NoMatch) {
-        // Create a new node
-        std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
-        N->Operands.push_back(ChildNode.get());
-        WorkList.push_back(std::move(ChildNode));
-      }
-    }
-  }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  static std::string getBasicBlockLabel(const BasicBlock *BB) {
-    if (BB->hasName())
-      return BB->getName().str();
-
-    std::string S;
-    raw_string_ostream OS(S);
-    BB->printAsOperand(OS, false);
-    return OS.str().substr(1);
-  }
-
-  void dump() const {
-    dbgs() << "Entry (";
-    dbgs() << getBasicBlockLabel(EntryNode->Node->getParent());
-    dbgs() << ") : " << *EntryNode->Node << "\n";
-    for (auto &E : Remats) {
-      dbgs() << *(E.first) << "\n";
-      for (RematNode *U : E.second->Operands)
-        dbgs() << "  " << *U->Node << "\n";
-    }
-  }
-#endif
-};
-} // end anonymous namespace
-
-namespace llvm {
-
-template <> struct GraphTraits<RematGraph *> {
-  using NodeRef = RematGraph::RematNode *;
-  using ChildIteratorType = RematGraph::RematNode **;
-
-  static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
-  static ChildIteratorType child_begin(NodeRef N) {
-    return N->Operands.begin();
-  }
-  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
-};
-
-} // end namespace llvm
-
-#undef DEBUG_TYPE // "coro-suspend-crossing"
 #define DEBUG_TYPE "coro-frame"
 
 namespace {
@@ -268,15 +145,6 @@ static void dumpSpills(StringRef Title, const coro::SpillInfo &Spills) {
       I->dump();
   }
 }
-static void dumpRemats(
-    StringRef Title,
-    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
-  dbgs() << "------------- " << Title << "--------------\n";
-  for (const auto &E : RM) {
-    E.second->dump();
-    dbgs() << "--\n";
-  }
-}
 
 static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) {
   dbgs() << "------------- Allocas --------------\n";
@@ -1634,93 +1502,6 @@ static void rewritePHIs(Function &F) {
     rewritePHIs(*BB);
 }
 
-/// Default materializable callback
-// Check for instructions that we can recreate on resume as opposed to spill
-// the result into a coroutine frame.
-bool coro::defaultMaterializable(Instruction &V) {
-  return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
-          isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
-}
-
-// For each instruction identified as materializable across the suspend point,
-// and its associated DAG of other rematerializable instructions,
-// recreate the DAG of instructions after the suspend point.
-static void rewriteMaterializableInstructions(
-    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
-        &AllRemats) {
-  // This has to be done in 2 phases
-  // Do the remats and record the required defs to be replaced in the
-  // original use instructions
-  // Once all the remats are complete, replace the uses in the final
-  // instructions with the new defs
-  typedef struct {
-    Instruction *Use;
-    Instruction *Def;
-    Instruction *Remat;
-  } ProcessNode;
-
-  SmallVector<ProcessNode> FinalInstructionsToProcess;
-
-  for (const auto &E : AllRemats) {
-    Instruction *Use = E.first;
-    Instruction *CurrentMaterialization = nullptr;
-    RematGraph *RG = E.second.get();
-    ReversePostOrderTraversal<RematGraph *> RPOT(RG);
-    SmallVector<Instruction *> InstructionsToProcess;
-
-    // If the target use is actually a suspend instruction then we have to
-    // insert the remats into the end of the predecessor (there should only be
-    // one). This is so that suspend blocks always have the suspend instruction
-    // as the first instruction.
-    auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
-    if (isa<CoroSuspendInst>(Use)) {
-      BasicBlock *SuspendPredecessorBlock =
-          Use->getParent()->getSinglePredecessor();
-      assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
-      InsertPoint = SuspendPredecessorBlock->getTerminator();
-    }
-
-    // Note: skip the first instruction as this is the actual use that we're
-    // rematerializing everything for.
-    auto I = RPOT.begin();
-    ++I;
-    for (; I != RPOT.end(); ++I) {
-      Instruction *D = (*I)->Node;
-      CurrentMaterialization = D->clone();
-      CurrentMaterialization->setName(D->getName());
-      CurrentMaterialization->insertBefore(InsertPoint);
-      InsertPoint = CurrentMaterialization;
-
-      // Replace all uses of Def in the instructions being added as part of this
-      // rematerialization group
-      for (auto &I : InstructionsToProcess)
-        I->replaceUsesOfWith(D, CurrentMaterialization);
-
-      // Don't replace the final use at this point as this can cause problems
-      // for other materializations. Instead, for any final use that uses a
-      // define that's being rematerialized, record the replace values
-      for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
-        if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
-          FinalInstructionsToProcess.push_back(
-              {Use, D, CurrentMaterialization});
-
-      InstructionsToProcess.push_back(CurrentMaterialization);
-    }
-  }
-
-  // Finally, replace the uses with the defines that we've just rematerialized
-  for (auto &R : FinalInstructionsToProcess) {
-    if (auto *PN = dyn_cast<PHINode>(R.Use)) {
-      assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
-                                                "values in the PHINode");
-      PN->replaceAllUsesWith(R.Remat);
-      PN->eraseFromParent();
-      continue;
-    }
-    R.Use->replaceUsesOfWith(R.Def, R.Remat);
-  }
-}
-
 // Splits the block at a particular instruction unless it is the first
 // instruction in the block with a single predecessor.
 static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) {
@@ -1741,10 +1522,6 @@ static void splitAround(Instruction *I, const Twine &Name) {
   splitBlockIfNotFirst(I->getNextNode(), "After" + Name);
 }
 
-static bool isSuspendBlock(BasicBlock *BB) {
-  return isa<AnyCoroSuspendInst>(BB->front());
-}
-
 /// After we split the coroutine, will the given basic block be along
 /// an obvious exit path for the resumption function?
 static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB,
@@ -1754,7 +1531,7 @@ static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB,
   if (depth == 0)
     return false;
   // If this is a suspend block, we're about to exit the resumption function.
-  if (isSuspendBlock(BB))
+  if (coro::isSuspendBlock(BB))
     return true;
 
   // Recurse into the successors.
@@ -1995,7 +1772,8 @@ static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
   DomSet.insert(&F.getEntryBlock());
   for (auto *CSI : Shape.CoroSuspends) {
     BasicBlock *SuspendBlock = CSI->getParent();
-    assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() &&
+    assert(coro::isSuspendBlock(SuspendBlock) &&
+           SuspendBlock->getSingleSuccessor() &&
            "should have split coro.suspend into its own block");
     DomSet.insert(SuspendBlock->getSingleSuccessor());
   }
@@ -2227,68 +2005,6 @@ void coro::salvageDebugInfo(
   }
 }
 
-static void doRematerializations(
-    Function &F, SuspendCrossingInfo &Checker,
-    const std::function<bool(Instruction &)> &MaterializableCallback) {
-  if (F.hasOptNone())
-    return;
-
-  coro::SpillInfo Spills;
-
-  // See if there are materializable instructions across suspend points
-  // We record these as the starting point to also identify materializable
-  // defs of uses in these operations
-  for (Instruction &I : instructions(F)) {
-    if (!MaterializableCallback(I))
-      continue;
-    for (User *U : I.users())
-      if (Checker.isDefinitionAcrossSuspend(I, U))
-        Spills[&I].push_back(cast<Instruction>(U));
-  }
-
-  // Process each of the identified rematerializable instructions
-  // and add predecessor instructions that can also be rematerialized.
-  // This is actually a graph of instructions since we could potentially
-  // have multiple uses of a def in the set of predecessor instructions.
-  // The approach here is to maintain a graph of instructions for each bottom
-  // level instruction - where we have a unique set of instructions (nodes)
-  // and edges between them. We then walk the graph in reverse post-dominator
-  // order to insert them past the suspend point, but ensure that ordering is
-  // correct. We also rely on CSE removing duplicate defs for remats of
-  // different instructions with a def in common (rather than maintaining more
-  // complex graphs for each suspend point)
-
-  // We can do this by adding new nodes to the list for each suspend
-  // point. Then using standard GraphTraits to give a reverse post-order
-  // traversal when we insert the nodes after the suspend
-  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
-  for (auto &E : Spills) {
-    for (Instruction *U : E.second) {
-      // Don't process a user twice (this can happen if the instruction uses
-      // more than one rematerializable def)
-      if (AllRemats.count(U))
-        continue;
-
-      // Constructor creates the whole RematGraph for the given Use
-      auto RematUPtr =
-          std::make_unique<RematGraph>(MaterializableCallback, U, Checker);
-
-      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
-                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
-                 for (auto I = RPOT.begin(); I != RPOT.end();
-                      ++I) { (*I)->Node->dump(); } dbgs()
-                 << "\n";);
-
-      AllRemats[U] = std::move(RematUPtr);
-    }
-  }
-
-  // Rewrite materializable instructions to be materialized at the use
-  // point.
-  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
-  rewriteMaterializableInstructions(AllRemats);
-}
-
 void coro::normalizeCoroutine(Function &F, coro::Shape &Shape,
                               TargetTransformInfo &TTI) {
   // Don't eliminate swifterror in async functions that won't be split.
@@ -2324,8 +2040,8 @@ void coro::normalizeCoroutine(Function &F, coro::Shape &Shape,
       IRBuilder<> Builder(AsyncEnd);
       SmallVector<Value *, 8> Args(AsyncEnd->args());
       auto Arguments = ArrayRef(Args).drop_front(3);
-      auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn,
-                                      TTI, Arguments, Builder);
+      auto *Call = coro::createMustTailCall(
+          AsyncEnd->getDebugLoc(), MustTailCallFn, TTI, Arguments, Builder);
       splitAround(Call, "MustTailCall.Before.CoroEnd");
     }
   }
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 698c21a797420..891798f53b2d0 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -21,6 +21,7 @@ class CallGraph;
 
 namespace coro {
 
+bool isSuspendBlock(BasicBlock *BB);
 bool declaresAnyIntrinsic(const Module &M);
 bool declaresIntrinsics(const Module &M,
                         const std::initializer_list<StringRef>);
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index be257339e0ac4..cdc442bc819c3 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -100,6 +100,10 @@ static bool isCoroutineIntrinsicName(StringRef Name) {
 }
 #endif
 
+bool coro::isSuspendBlock(BasicBlock *BB) {
+  return isa<AnyCoroSuspendInst>(BB->front());
+}
+
 bool coro::declaresAnyIntrinsic(const Module &M) {
   for (StringRef Name : CoroIntrinsics) {
     assert(isCoroutineIntrinsicName(Name) && "not a coroutine intrinsic");
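The materializable-instruction callback stays the extension point through this
refactor: whatever the callback rejects is spilled to the coroutine frame,
whatever it accepts is recomputed after the suspend. As a rough illustration
(the function name below is hypothetical; coro::defaultMaterializable is the
internal hook shown in this series), a caller could widen the default set:

    // Sketch only, assuming LLVM's internal CoroInternal.h is available.
    #include "CoroInternal.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Also recompute freeze instructions after a suspend, on top of the
    // defaults (casts, GEPs, binary ops, compares, selects).
    static bool widerMaterializable(Instruction &I) {
      return coro::defaultMaterializable(I) || isa<FreezeInst>(I);
    }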
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
new file mode 100644
index 0000000000000..708e8734175f9
--- /dev/null
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
@@ -0,0 +1,308 @@
+//===- MaterializationUtils.cpp - Materialize insts after suspend points -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file contains classes used to materialize insts after suspend points.
+//===----------------------------------------------------------------------===//
+
+#include "MaterializationUtils.h"
+#include "SpillUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include <deque>
+
+using namespace llvm;
+
+using namespace coro;
+
+// The "coro-suspend-crossing" flag is very noisy. There is another debug type,
+// "coro-frame", which results in leaner debug spew.
+#define DEBUG_TYPE "coro-suspend-crossing"
+
+namespace {
+
+// RematGraph is used to construct a DAG for rematerializable instructions
+// When the constructor is invoked with a candidate instruction (which is
+// materializable) it builds a DAG of materializable instructions from that
+// point.
+// Typically, for each instruction identified as re-materializable across a
+// suspend point, a RematGraph will be created.
+struct RematGraph {
+  // Each RematNode in the graph contains the edges to instructions providing
+  // operands in the current node.
+  struct RematNode {
+    Instruction *Node;
+    SmallVector<RematNode *> Operands;
+    RematNode() = default;
+    RematNode(Instruction *V) : Node(V) {}
+  };
+
+  RematNode *EntryNode;
+  using RematNodeMap =
+      SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
+  RematNodeMap Remats;
+  const std::function<bool(Instruction &)> &MaterializableCallback;
+  SuspendCrossingInfo &Checker;
+
+  RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
+             Instruction *I, SuspendCrossingInfo &Checker)
+      : MaterializableCallback(MaterializableCallback), Checker(Checker) {
+    std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
+    EntryNode = FirstNode.get();
+    std::deque<std::unique_ptr<RematNode>> WorkList;
+    addNode(std::move(FirstNode), WorkList, cast<User>(I));
+    while (WorkList.size()) {
+      std::unique_ptr<RematNode> N = std::move(WorkList.front());
+      WorkList.pop_front();
+      addNode(std::move(N), WorkList, cast<User>(I));
+    }
+  }
+
+  void addNode(std::unique_ptr<RematNode> NUPtr,
+               std::deque<std::unique_ptr<RematNode>> &WorkList,
+               User *FirstUse) {
+    RematNode *N = NUPtr.get();
+    if (Remats.count(N->Node))
+      return;
+
+    // We haven't seen this node yet - add to the list
+    Remats[N->Node] = std::move(NUPtr);
+    for (auto &Def : N->Node->operands()) {
+      Instruction *D = dyn_cast<Instruction>(Def.get());
+      if (!D || !MaterializableCallback(*D) ||
+          !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
+        continue;
+
+      if (Remats.count(D)) {
+        // Already have this in the graph
+        N->Operands.push_back(Remats[D].get());
+        continue;
+      }
+
+      bool NoMatch = true;
+      for (auto &I : WorkList) {
+        if (I->Node == D) {
+          NoMatch = false;
+          N->Operands.push_back(I.get());
+          break;
+        }
+      }
+      if (NoMatch) {
+        // Create a new node
+        std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
+        N->Operands.push_back(ChildNode.get());
+        WorkList.push_back(std::move(ChildNode));
+      }
+    }
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  static std::string getBasicBlockLabel(const BasicBlock *BB) {
+    if (BB->hasName())
+      return BB->getName().str();
+
+    std::string S;
+    raw_string_ostream OS(S);
+    BB->printAsOperand(OS, false);
+    return OS.str().substr(1);
+  }
+
+  void dump() const {
+    dbgs() << "Entry (";
+    dbgs() << getBasicBlockLabel(EntryNode->Node->getParent());
+    dbgs() << ") : " << *EntryNode->Node << "\n";
+    for (auto &E : Remats) {
+      dbgs() << *(E.first) << "\n";
+      for (RematNode *U : E.second->Operands)
+        dbgs() << "  " << *U->Node << "\n";
+    }
+  }
+#endif
+};
+
+} // namespace
+
+namespace llvm {
+template <> struct GraphTraits<RematGraph *> {
+  using NodeRef = RematGraph::RematNode *;
+  using ChildIteratorType = RematGraph::RematNode **;
+
+  static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
+  static ChildIteratorType child_begin(NodeRef N) {
+    return N->Operands.begin();
+  }
+  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
+};
+
+} // end namespace llvm
+
+// For each instruction identified as materializable across the suspend point,
+// and its associated DAG of other rematerializable instructions,
+// recreate the DAG of instructions after the suspend point.
+static void rewriteMaterializableInstructions(
+    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
+        &AllRemats) {
+  // This has to be done in 2 phases
+  // Do the remats and record the required defs to be replaced in the
+  // original use instructions
+  // Once all the remats are complete, replace the uses in the final
+  // instructions with the new defs
+  typedef struct {
+    Instruction *Use;
+    Instruction *Def;
+    Instruction *Remat;
+  } ProcessNode;
+
+  SmallVector<ProcessNode> FinalInstructionsToProcess;
+
+  for (const auto &E : AllRemats) {
+    Instruction *Use = E.first;
+    Instruction *CurrentMaterialization = nullptr;
+    RematGraph *RG = E.second.get();
+    ReversePostOrderTraversal<RematGraph *> RPOT(RG);
+    SmallVector<Instruction *> InstructionsToProcess;
+
+    // If the target use is actually a suspend instruction then we have to
+    // insert the remats into the end of the predecessor (there should only be
+    // one). This is so that suspend blocks always have the suspend instruction
+    // as the first instruction.
+    auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
+    if (isa<CoroSuspendInst>(Use)) {
+      BasicBlock *SuspendPredecessorBlock =
+          Use->getParent()->getSinglePredecessor();
+      assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
+      InsertPoint = SuspendPredecessorBlock->getTerminator();
+    }
+
+    // Note: skip the first instruction as this is the actual use that we're
+    // rematerializing everything for.
+    auto I = RPOT.begin();
+    ++I;
+    for (; I != RPOT.end(); ++I) {
+      Instruction *D = (*I)->Node;
+      CurrentMaterialization = D->clone();
+      CurrentMaterialization->setName(D->getName());
+      CurrentMaterialization->insertBefore(InsertPoint);
+      InsertPoint = CurrentMaterialization;
+
+      // Replace all uses of Def in the instructions being added as part of this
+      // rematerialization group
+      for (auto &I : InstructionsToProcess)
+        I->replaceUsesOfWith(D, CurrentMaterialization);
+
+      // Don't replace the final use at this point as this can cause problems
+      // for other materializations. Instead, for any final use that uses a
+      // define that's being rematerialized, record the replace values
+      for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
+        if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
+          FinalInstructionsToProcess.push_back(
+              {Use, D, CurrentMaterialization});
+
+      InstructionsToProcess.push_back(CurrentMaterialization);
+    }
+  }
+
+  // Finally, replace the uses with the defines that we've just rematerialized
+  for (auto &R : FinalInstructionsToProcess) {
+    if (auto *PN = dyn_cast<PHINode>(R.Use)) {
+      assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
+                                                "values in the PHINode");
+      PN->replaceAllUsesWith(R.Remat);
+      PN->eraseFromParent();
+      continue;
+    }
+    R.Use->replaceUsesOfWith(R.Def, R.Remat);
+  }
+}
+
+/// Default materializable callback
+// Check for instructions that we can recreate on resume as opposed to spill
+// the result into a coroutine frame.
+bool llvm::coro::defaultMaterializable(Instruction &V) {
+  return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
+          isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
+}
+
+bool llvm::coro::isTriviallyMaterializable(Instruction &V) {
+  return defaultMaterializable(V);
+}
+
+#ifndef NDEBUG
+static void dumpRemats(
+    StringRef Title,
+    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
+  dbgs() << "------------- " << Title << "--------------\n";
+  for (const auto &E : RM) {
+    E.second->dump();
+    dbgs() << "--\n";
+  }
+}
+#endif
+
+void coro::doRematerializations(
+    Function &F, SuspendCrossingInfo &Checker,
+    std::function<bool(Instruction &)> IsMaterializable) {
+  if (F.hasOptNone())
+    return;
+
+  coro::SpillInfo Spills;
+
+  // See if there are materializable instructions across suspend points
+  // We record these as the starting point to also identify materializable
+  // defs of uses in these operations
+  for (Instruction &I : instructions(F)) {
+    if (!IsMaterializable(I))
+      continue;
+    for (User *U : I.users())
+      if (Checker.isDefinitionAcrossSuspend(I, U))
+        Spills[&I].push_back(cast<Instruction>(U));
+  }
+
+  // Process each of the identified rematerializable instructions
+  // and add predecessor instructions that can also be rematerialized.
+  // This is actually a graph of instructions since we could potentially
+  // have multiple uses of a def in the set of predecessor instructions.
+  // The approach here is to maintain a graph of instructions for each bottom
+  // level instruction - where we have a unique set of instructions (nodes)
+  // and edges between them. We then walk the graph in reverse post-dominator
+  // order to insert them past the suspend point, but ensure that ordering is
+  // correct. We also rely on CSE removing duplicate defs for remats of
+  // different instructions with a def in common (rather than maintaining more
+  // complex graphs for each suspend point)
+
+  // We can do this by adding new nodes to the list for each suspend
+  // point. Then using standard GraphTraits to give a reverse post-order
+  // traversal when we insert the nodes after the suspend
+  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
+  for (auto &E : Spills) {
+    for (Instruction *U : E.second) {
+      // Don't process a user twice (this can happen if the instruction uses
+      // more than one rematerializable def)
+      if (AllRemats.count(U))
+        continue;
+
+      // Constructor creates the whole RematGraph for the given Use
+      auto RematUPtr =
+          std::make_unique<RematGraph>(IsMaterializable, U, Checker);
+
+      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
+                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
+                 for (auto I = RPOT.begin(); I != RPOT.end();
+                      ++I) { (*I)->Node->dump(); } dbgs()
+                 << "\n";);
+
+      AllRemats[U] = std::move(RematUPtr);
+    }
+  }
+
+  // Rewrite materializable instructions to be materialized at the use
+  // point.
+  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
+  rewriteMaterializableInstructions(AllRemats);
+}
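Taken together, the new entry points are meant to be driven the way
buildCoroutineFrame drives them after the previous patch. A sketch of that
call pattern (the wrapper below is hypothetical and mirrors the CoroFrame.cpp
call site; F and Shape are assumed to come from the splitting pass):

    // Sketch only, assuming the internal headers from this patch.
    #include "CoroInternal.h"
    #include "MaterializationUtils.h"
    using namespace llvm;

    static void rematerializeAcrossSuspends(Function &F, coro::Shape &Shape) {
      // Compute which definitions cross a suspend, then recreate the cheap
      // ones after each suspend point instead of spilling them to the frame.
      SuspendCrossingInfo Checker(F, Shape.CoroSuspends, Shape.CoroEnds);
      coro::doRematerializations(F, Checker, coro::isTriviallyMaterializable);
    }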
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.h b/llvm/lib/Transforms/Coroutines/MaterializationUtils.h
new file mode 100644
index 0000000000000..f391851c97b3b
--- /dev/null
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.h
@@ -0,0 +1,30 @@
+//===- MaterializationUtils.h - Utilities for doing materialization ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SuspendCrossingInfo.h"
+#include "llvm/IR/Instruction.h"
+
+#ifndef LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
+#define LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
+
+namespace llvm {
+
+namespace coro {
+
+// True if I is trivially rematerializable, e.g. InsertElementInst
+bool isTriviallyMaterializable(Instruction &I);
+
+// Performs rematerialization, invoked from buildCoroutineFrame.
+void doRematerializations(Function &F, SuspendCrossingInfo &Checker,
+                          std::function<bool(Instruction &)> IsMaterializable);
+
+} // namespace coro
+
+} // namespace llvm
+
+#endif // LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
index d71b0a336d471..4c12e66f288db 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
@@ -23,17 +23,6 @@ namespace {
 
 typedef SmallPtrSet<BasicBlock *, 8> VisitedBlocksSet;
 
-static bool isSuspendBlock(BasicBlock *BB) {
-  return isa<AnyCoroSuspendInst>(BB->front());
-}
-
-// Check for structural coroutine intrinsics that should not be spilled into
-// the coroutine frame.
-static bool isCoroutineStructureIntrinsic(Instruction &I) {
-  return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
-         isa<CoroSuspendInst>(&I);
-}
-
 /// Does control flow starting at the given block ever reach a suspend
 /// instruction before reaching a block in VisitedOrFreeBBs?
 static bool isSuspendReachableFrom(BasicBlock *From,
@@ -45,7 +34,7 @@ static bool isSuspendReachableFrom(BasicBlock *From,
     return false;
 
   // We assume that we'll already have split suspends into their own blocks.
-  if (isSuspendBlock(From))
+  if (coro::isSuspendBlock(From))
    return true;
 
   // Recurse on the successors.
@@ -448,6 +437,13 @@ static void collectFrameAlloca(AllocaInst *AI, const coro::Shape &Shape,
 
 } // namespace
 
+// Check for structural coroutine intrinsics that should not be spilled into
+// the coroutine frame.
+bool isCoroutineStructureIntrinsic(Instruction &I) {
+  return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
+         isa<CoroSuspendInst>(&I);
+}
+
 void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
                            const SuspendCrossingInfo &Checker) {
   // Collect the spills for arguments and other not-materializable values.
@@ -626,6 +622,6 @@ BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
   return InsertPt;
 }
 
-} // End namespace coro.
+} // namespace coro
 
-} // End namespace llvm.
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.h b/llvm/lib/Transforms/Coroutines/SpillUtils.h
index de0ff0bcd3a4f..8843b611e0842 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.h
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.h
@@ -29,8 +29,6 @@ struct AllocaInfo {
       MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {}
 };
 
-bool isSuspendBlock(BasicBlock *BB);
-
 void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
                            const SuspendCrossingInfo &Checker);
 void collectSpillsAndAllocasFromInsts(