diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst index f007dfe549990..2349e51477b7d 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst @@ -12,6 +12,15 @@ after the call. When the function returns such a parameter also as constant reference then the returned reference can be used after the object it refers to has been destroyed. +This issue can be resolved by declaring an overload of the problematic function +where the ``const &`` parameter is instead declared as ``&&``. The developer has +to ensure that the implementation of that function does not produce a +use-after-free, the exact error that this check warns against. +Marking such an ``&&`` overload as ``deleted`` will silence the warning as +well. When different ``const &`` parameters may be returned depending +on the control flow of the function, an overload in which all problematic +``const &`` parameters have been declared as ``&&`` will resolve the issue. + Example ------- diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 847bf4baf7488..c124fefc78611 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -2571,8 +2571,8 @@ with the `offsetof` macro. .. _alpha-core-StackAddressAsyncEscape: -alpha.core.StackAddressAsyncEscape (C) -"""""""""""""""""""""""""""""""""""""" +alpha.core.StackAddressAsyncEscape (ObjC) +""""""""""""""""""""""""""""""""""""""""" Check that addresses to stack memory do not escape the function that involves dispatch_after or dispatch_async. This checker is a part of ``core.StackAddressEscape``, but is temporarily disabled until some false positives are fixed.
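An editorial aside: the remedy the new clang-tidy documentation describes is easiest to see in code. Below is a minimal C++ sketch (the function ``longestOf`` and its body are hypothetical, not part of this patch) showing the dangling-reference hazard and the deleted ``&&`` overloads that both reject it at compile time and silence the check:

```cpp
#include <string>

// Returns one of its const & parameters. If a caller binds either parameter
// to a temporary, the returned reference dangles once that temporary is
// destroyed at the end of the full expression.
const std::string &longestOf(const std::string &a, const std::string &b) {
  return a.size() > b.size() ? a : b;
}

// The remedy sketched in the documentation above: && overloads cover the
// problematic parameters. Marking them deleted rejects rvalue arguments at
// compile time, and the check no longer warns on the const & overload.
const std::string &longestOf(std::string &&, const std::string &) = delete;
const std::string &longestOf(const std::string &, std::string &&) = delete;
const std::string &longestOf(std::string &&, std::string &&) = delete;

int main() {
  std::string x = "abc", y = "de";
  const std::string &ok = longestOf(x, y); // fine: x and y outlive the call
  // const std::string &bad = longestOf(x, "tmp"); // ill-formed: deleted overload
  return static_cast<int>(ok.size());
}
```

With the deleted overloads in place, a call that passes a temporary fails to compile instead of returning a reference into a destroyed object.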
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 62c382b67ad14..5448bd841959f 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -728,6 +728,9 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, case CK_ZNVER4: defineCPUMacros(Builder, "znver4"); break; + case CK_ZNVER5: + defineCPUMacros(Builder, "znver5"); + break; case CK_Geode: defineCPUMacros(Builder, "geode"); break; @@ -1626,6 +1629,7 @@ std::optional<unsigned> X86TargetInfo::getCPUCacheLineSize() const { case CK_ZNVER2: case CK_ZNVER3: case CK_ZNVER4: + case CK_ZNVER5: // Deprecated case CK_x86_64: case CK_x86_64_v2: diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap index 9ffc249c8d1a2..dcaf09e8f2c55 100644 --- a/clang/lib/Headers/module.modulemap +++ b/clang/lib/Headers/module.modulemap @@ -66,6 +66,8 @@ module _Builtin_intrinsics [system] [extern_c] { textual header "__wmmintrin_aes.h" textual header "__wmmintrin_pclmul.h" + textual header "mm3dnow.h" + explicit module mm_malloc { requires !freestanding header "mm_malloc.h" @@ -122,10 +124,6 @@ module _Builtin_intrinsics [system] [extern_c] { header "popcntintrin.h" } - explicit module mm3dnow { - header "mm3dnow.h" - } - explicit module aes_pclmul { header "wmmintrin.h" export aes diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c b/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c index 471a31a8c5eac..8a2bc93dd6cd0 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c @@ -3,6 +3,8 @@ // RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-LE // RUN: %clang_cc1 -O3 -triple powerpc64-unknown-unknown -target-cpu pwr10 \ // RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-BE +// RUN: %clang_cc1 -O0 -triple powerpc64le-unknown-unknown -target-cpu pwr10 \ +// RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-LE-NOOPT // CHECK-LE-LABEL: @test1( // CHECK-LE-NEXT: entry: @@ -16,6 +18,42 @@ // CHECK-BE-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2:![0-9]+]] // CHECK-BE-NEXT: ret void // +// CHECK-LE-NOOPT-LABEL: @test1( +// CHECK-LE-NOOPT-NEXT: entry: +// CHECK-LE-NOOPT-NEXT: [[VQP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-LE-NOOPT-NEXT: [[VPP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-LE-NOOPT-NEXT: [[VC1_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-LE-NOOPT-NEXT: [[VC2_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-LE-NOOPT-NEXT: [[VC3_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-LE-NOOPT-NEXT: [[VC4_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-LE-NOOPT-NEXT: [[RESP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-LE-NOOPT-NEXT: [[VQ:%.*]] = alloca <512 x i1>, align 64 +// CHECK-LE-NOOPT-NEXT: [[VP:%.*]] = alloca <256 x i1>, align 32 +// CHECK-LE-NOOPT-NEXT: [[RES:%.*]] = alloca <512 x i1>, align 64 +// CHECK-LE-NOOPT-NEXT: store ptr [[VQP:%.*]], ptr [[VQP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store ptr [[VPP:%.*]], ptr [[VPP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC1:%.*]], ptr [[VC1_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC2:%.*]], ptr [[VC2_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC3:%.*]], ptr [[VC3_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC4:%.*]], ptr [[VC4_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store ptr [[RESP:%.*]], ptr [[RESP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VQP_ADDR]],
align 8 +// CHECK-LE-NOOPT-NEXT: [[TMP1:%.*]] = load <512 x i1>, ptr [[TMP0]], align 64 +// CHECK-LE-NOOPT-NEXT: store <512 x i1> [[TMP1]], ptr [[VQ]], align 64 +// CHECK-LE-NOOPT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VPP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: [[TMP3:%.*]] = load <256 x i1>, ptr [[TMP2]], align 32 +// CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP3]], ptr [[VP]], align 32 +// CHECK-LE-NOOPT-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC1_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC2_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC3_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[VC4_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP8:%.*]] = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[TMP7]], <16 x i8> [[TMP6]], <16 x i8> [[TMP5]], <16 x i8> [[TMP4]]) +// CHECK-LE-NOOPT-NEXT: store <512 x i1> [[TMP8]], ptr [[RES]], align 64 +// CHECK-LE-NOOPT-NEXT: [[TMP9:%.*]] = load <512 x i1>, ptr [[RES]], align 64 +// CHECK-LE-NOOPT-NEXT: [[TMP10:%.*]] = load ptr, ptr [[RESP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store <512 x i1> [[TMP9]], ptr [[TMP10]], align 64 +// CHECK-LE-NOOPT-NEXT: ret void +// void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, vector unsigned char vc2, vector unsigned char vc3, vector unsigned char vc4, unsigned char *resp) { __vector_quad vq = *((__vector_quad *)vqp); @@ -37,6 +75,36 @@ void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, vec // CHECK-BE-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP:%.*]], align 32, !tbaa [[TBAA6:![0-9]+]] // CHECK-BE-NEXT: ret void // +// CHECK-LE-NOOPT-LABEL: @test2( +// CHECK-LE-NOOPT-NEXT: entry: +// CHECK-LE-NOOPT-NEXT: [[VQP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-LE-NOOPT-NEXT: [[VPP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-LE-NOOPT-NEXT: [[VC1_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-LE-NOOPT-NEXT: [[VC2_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-LE-NOOPT-NEXT: [[RESP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-LE-NOOPT-NEXT: [[VQ:%.*]] = alloca <512 x i1>, align 64 +// CHECK-LE-NOOPT-NEXT: [[VP:%.*]] = alloca <256 x i1>, align 32 +// CHECK-LE-NOOPT-NEXT: [[RES:%.*]] = alloca <256 x i1>, align 32 +// CHECK-LE-NOOPT-NEXT: store ptr [[VQP:%.*]], ptr [[VQP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store ptr [[VPP:%.*]], ptr [[VPP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC1:%.*]], ptr [[VC1_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC2:%.*]], ptr [[VC2_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store ptr [[RESP:%.*]], ptr [[RESP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VQP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: [[TMP1:%.*]] = load <512 x i1>, ptr [[TMP0]], align 64 +// CHECK-LE-NOOPT-NEXT: store <512 x i1> [[TMP1]], ptr [[VQ]], align 64 +// CHECK-LE-NOOPT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VPP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: [[TMP3:%.*]] = load <256 x i1>, ptr [[TMP2]], align 32 +// CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP3]], ptr [[VP]], align 32 +// CHECK-LE-NOOPT-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC1_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC2_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP6:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP5]], <16 x i8> [[TMP4]]) +// CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP6]], ptr [[RES]], align 64 +// CHECK-LE-NOOPT-NEXT: [[TMP7:%.*]] = load <256 x i1>, ptr 
[[RES]], align 32 +// CHECK-LE-NOOPT-NEXT: [[TMP8:%.*]] = load ptr, ptr [[RESP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP7]], ptr [[TMP8]], align 32 +// CHECK-LE-NOOPT-NEXT: ret void +// void test2(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, vector unsigned char vc2, unsigned char *resp) { __vector_quad vq = *((__vector_quad *)vqp); diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c index a414a2827b2c4..39c040967dc0c 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c @@ -16,18 +16,18 @@ // CHECK-NEXT: store <16 x i8> [[VC:%.*]], ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-NEXT: store ptr [[TMP0]], ptr [[VQP]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VQP]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load <512 x i1>, ptr [[TMP2]], align 64 -// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[VQ1]], align 64 -// CHECK-NEXT: [[TMP4:%.*]] = call <512 x i1> @llvm.ppc.mma.xxsetaccz() -// CHECK-NEXT: store <512 x i1> [[TMP4]], ptr [[VQ2]], align 64 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VQP]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[VQ1]], align 64 +// CHECK-NEXT: [[TMP3:%.*]] = call <512 x i1> @llvm.ppc.mma.xxsetaccz() +// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[VQ2]], align 64 +// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP7:%.*]] = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -// CHECK-NEXT: store <512 x i1> [[TMP7]], ptr [[VQ3]], align 64 -// CHECK-NEXT: [[TMP8:%.*]] = load <512 x i1>, ptr [[VQ3]], align 64 -// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[VQP]], align 8 -// CHECK-NEXT: store <512 x i1> [[TMP8]], ptr [[TMP9]], align 64 +// CHECK-NEXT: [[TMP6:%.*]] = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +// CHECK-NEXT: store <512 x i1> [[TMP6]], ptr [[VQ3]], align 64 +// CHECK-NEXT: [[TMP7:%.*]] = load <512 x i1>, ptr [[VQ3]], align 64 +// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[VQP]], align 8 +// CHECK-NEXT: store <512 x i1> [[TMP7]], ptr [[TMP8]], align 64 // CHECK-NEXT: ret void // // CHECK-BE-LABEL: @testVQLocal( @@ -42,18 +42,18 @@ // CHECK-BE-NEXT: store <16 x i8> [[VC:%.*]], ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-BE-NEXT: store ptr [[TMP0]], ptr [[VQP]], align 8 -// CHECK-BE-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VQP]], align 8 -// CHECK-BE-NEXT: [[TMP3:%.*]] = load <512 x i1>, ptr [[TMP2]], align 64 -// CHECK-BE-NEXT: store <512 x i1> [[TMP3]], ptr [[VQ1]], align 64 -// CHECK-BE-NEXT: [[TMP4:%.*]] = call <512 x i1> @llvm.ppc.mma.xxsetaccz() -// CHECK-BE-NEXT: store <512 x i1> [[TMP4]], ptr [[VQ2]], align 64 +// CHECK-BE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VQP]], align 8 +// CHECK-BE-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 +// CHECK-BE-NEXT: store <512 x i1> [[TMP2]], ptr [[VQ1]], align 64 +// CHECK-BE-NEXT: [[TMP3:%.*]] = call <512 x i1> @llvm.ppc.mma.xxsetaccz() +// CHECK-BE-NEXT: store <512 x i1> [[TMP3]], ptr [[VQ2]], align 64 +// CHECK-BE-NEXT: [[TMP4:%.*]] = load <16 x i8>, 
ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP7:%.*]] = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -// CHECK-BE-NEXT: store <512 x i1> [[TMP7]], ptr [[VQ3]], align 64 -// CHECK-BE-NEXT: [[TMP8:%.*]] = load <512 x i1>, ptr [[VQ3]], align 64 -// CHECK-BE-NEXT: [[TMP9:%.*]] = load ptr, ptr [[VQP]], align 8 -// CHECK-BE-NEXT: store <512 x i1> [[TMP8]], ptr [[TMP9]], align 64 +// CHECK-BE-NEXT: [[TMP6:%.*]] = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +// CHECK-BE-NEXT: store <512 x i1> [[TMP6]], ptr [[VQ3]], align 64 +// CHECK-BE-NEXT: [[TMP7:%.*]] = load <512 x i1>, ptr [[VQ3]], align 64 +// CHECK-BE-NEXT: [[TMP8:%.*]] = load ptr, ptr [[VQP]], align 8 +// CHECK-BE-NEXT: store <512 x i1> [[TMP7]], ptr [[TMP8]], align 64 // CHECK-BE-NEXT: ret void // void testVQLocal(int *ptr, vector unsigned char vc) { @@ -79,24 +79,24 @@ void testVQLocal(int *ptr, vector unsigned char vc) { // CHECK-NEXT: store <16 x i8> [[VC:%.*]], ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-NEXT: store ptr [[TMP0]], ptr [[VPP]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VPP]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, ptr [[TMP2]], align 32 -// CHECK-NEXT: store <256 x i1> [[TMP3]], ptr [[VP1]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VPP]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <256 x i1>, ptr [[TMP1]], align 32 +// CHECK-NEXT: store <256 x i1> [[TMP2]], ptr [[VP1]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) -// CHECK-NEXT: store <256 x i1> [[TMP6]], ptr [[VP2]], align 64 +// CHECK-NEXT: [[TMP5:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]]) +// CHECK-NEXT: store <256 x i1> [[TMP5]], ptr [[VP2]], align 64 +// CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP8]], <16 x i8> [[TMP7]]) -// CHECK-NEXT: store <256 x i1> [[TMP9]], ptr [[VP2]], align 64 -// CHECK-NEXT: [[TMP10:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 -// CHECK-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP10]], <16 x i8> [[TMP11]]) -// CHECK-NEXT: store <512 x i1> [[TMP12]], ptr [[VQ]], align 64 -// CHECK-NEXT: [[TMP13:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[VPP]], align 8 -// CHECK-NEXT: store <256 x i1> [[TMP13]], ptr [[TMP14]], align 32 +// CHECK-NEXT: [[TMP8:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP7]], <16 x i8> [[TMP6]]) +// CHECK-NEXT: store <256 x i1> [[TMP8]], ptr [[VP2]], align 64 +// CHECK-NEXT: [[TMP9:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 +// CHECK-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] 
= call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP9]], <16 x i8> [[TMP10]]) +// CHECK-NEXT: store <512 x i1> [[TMP11]], ptr [[VQ]], align 64 +// CHECK-NEXT: [[TMP12:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[VPP]], align 8 +// CHECK-NEXT: store <256 x i1> [[TMP12]], ptr [[TMP13]], align 32 // CHECK-NEXT: ret void // // CHECK-BE-LABEL: @testVPLocal( @@ -112,24 +112,24 @@ void testVQLocal(int *ptr, vector unsigned char vc) { // CHECK-BE-NEXT: store <16 x i8> [[VC:%.*]], ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-BE-NEXT: store ptr [[TMP0]], ptr [[VPP]], align 8 -// CHECK-BE-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VPP]], align 8 -// CHECK-BE-NEXT: [[TMP3:%.*]] = load <256 x i1>, ptr [[TMP2]], align 32 -// CHECK-BE-NEXT: store <256 x i1> [[TMP3]], ptr [[VP1]], align 32 +// CHECK-BE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VPP]], align 8 +// CHECK-BE-NEXT: [[TMP2:%.*]] = load <256 x i1>, ptr [[TMP1]], align 32 +// CHECK-BE-NEXT: store <256 x i1> [[TMP2]], ptr [[VP1]], align 32 +// CHECK-BE-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP6:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) -// CHECK-BE-NEXT: store <256 x i1> [[TMP6]], ptr [[VP2]], align 64 +// CHECK-BE-NEXT: [[TMP5:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]]) +// CHECK-BE-NEXT: store <256 x i1> [[TMP5]], ptr [[VP2]], align 64 +// CHECK-BE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP9:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) -// CHECK-BE-NEXT: store <256 x i1> [[TMP9]], ptr [[VP2]], align 64 -// CHECK-BE-NEXT: [[TMP10:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 -// CHECK-BE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP12:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP10]], <16 x i8> [[TMP11]]) -// CHECK-BE-NEXT: store <512 x i1> [[TMP12]], ptr [[VQ]], align 64 -// CHECK-BE-NEXT: [[TMP13:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 -// CHECK-BE-NEXT: [[TMP14:%.*]] = load ptr, ptr [[VPP]], align 8 -// CHECK-BE-NEXT: store <256 x i1> [[TMP13]], ptr [[TMP14]], align 32 +// CHECK-BE-NEXT: [[TMP8:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP6]], <16 x i8> [[TMP7]]) +// CHECK-BE-NEXT: store <256 x i1> [[TMP8]], ptr [[VP2]], align 64 +// CHECK-BE-NEXT: [[TMP9:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 +// CHECK-BE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 +// CHECK-BE-NEXT: [[TMP11:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP9]], <16 x i8> [[TMP10]]) +// CHECK-BE-NEXT: store <512 x i1> [[TMP11]], ptr [[VQ]], align 64 +// CHECK-BE-NEXT: [[TMP12:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 +// CHECK-BE-NEXT: [[TMP13:%.*]] = load ptr, ptr [[VPP]], align 8 +// CHECK-BE-NEXT: store <256 x i1> [[TMP12]], ptr [[TMP13]], align 32 // CHECK-BE-NEXT: ret void // void testVPLocal(int *ptr, vector unsigned char vc) { @@ -154,18 +154,18 @@ void testVPLocal(int *ptr, vector unsigned char vc) { // 
CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ACC_ADDR]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 // CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 -// CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 16 -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 -// CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP8]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 -// CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP10]], align 16 -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 -// CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP12]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 +// CHECK-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 +// CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP7]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 +// CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP9]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 +// CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP11]], align 16 // CHECK-NEXT: ret void // // CHECK-BE-LABEL: @testRestrictQualifiedPointer2( @@ -178,18 +178,18 @@ void testVPLocal(int *ptr, vector unsigned char vc) { // CHECK-BE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ACC_ADDR]], align 8 // CHECK-BE-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 // CHECK-BE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]]) -// CHECK-BE-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 -// CHECK-BE-NEXT: [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 -// CHECK-BE-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 16 -// CHECK-BE-NEXT: [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 -// CHECK-BE-NEXT: [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 -// CHECK-BE-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP8]], align 16 -// CHECK-BE-NEXT: [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 -// CHECK-BE-NEXT: [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 -// CHECK-BE-NEXT: store <16 x i8> 
[[TMP9]], ptr [[TMP10]], align 16 -// CHECK-BE-NEXT: [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 -// CHECK-BE-NEXT: [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 -// CHECK-BE-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP12]], align 16 +// CHECK-BE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 +// CHECK-BE-NEXT: [[TMP5:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 +// CHECK-BE-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-BE-NEXT: [[TMP6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 +// CHECK-BE-NEXT: [[TMP7:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 +// CHECK-BE-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP7]], align 16 +// CHECK-BE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 +// CHECK-BE-NEXT: [[TMP9:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 +// CHECK-BE-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP9]], align 16 +// CHECK-BE-NEXT: [[TMP10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 +// CHECK-BE-NEXT: [[TMP11:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 +// CHECK-BE-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP11]], align 16 // CHECK-BE-NEXT: ret void // void testRestrictQualifiedPointer2(__vector_quad *__restrict acc) { @@ -207,18 +207,18 @@ void testRestrictQualifiedPointer2(__vector_quad *__restrict acc) { // CHECK-NEXT: [[TMP1:%.*]] = load volatile ptr, ptr [[ACC_ADDR]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 // CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 -// CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 16 -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 -// CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP8]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 -// CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP10]], align 16 -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 -// CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP12]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 +// CHECK-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 +// CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP7]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = 
getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 +// CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP9]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 +// CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP11]], align 16 // CHECK-NEXT: ret void // // CHECK-BE-LABEL: @testVolatileQualifiedPointer2( @@ -231,18 +231,18 @@ void testRestrictQualifiedPointer2(__vector_quad *__restrict acc) { // CHECK-BE-NEXT: [[TMP1:%.*]] = load volatile ptr, ptr [[ACC_ADDR]], align 8 // CHECK-BE-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 // CHECK-BE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]]) -// CHECK-BE-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 -// CHECK-BE-NEXT: [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 -// CHECK-BE-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 16 -// CHECK-BE-NEXT: [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 -// CHECK-BE-NEXT: [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 -// CHECK-BE-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP8]], align 16 -// CHECK-BE-NEXT: [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 -// CHECK-BE-NEXT: [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 -// CHECK-BE-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP10]], align 16 -// CHECK-BE-NEXT: [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 -// CHECK-BE-NEXT: [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 -// CHECK-BE-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP12]], align 16 +// CHECK-BE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 +// CHECK-BE-NEXT: [[TMP5:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 +// CHECK-BE-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-BE-NEXT: [[TMP6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 +// CHECK-BE-NEXT: [[TMP7:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 +// CHECK-BE-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP7]], align 16 +// CHECK-BE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 +// CHECK-BE-NEXT: [[TMP9:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 +// CHECK-BE-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP9]], align 16 +// CHECK-BE-NEXT: [[TMP10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 +// CHECK-BE-NEXT: [[TMP11:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 +// CHECK-BE-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP11]], align 16 // CHECK-BE-NEXT: ret void // void testVolatileQualifiedPointer2(__vector_quad *__volatile acc) { diff --git a/clang/test/CodeGen/target-builtin-noerror.c b/clang/test/CodeGen/target-builtin-noerror.c index 14024e3953182..2a05074d7c2b6 100644 --- a/clang/test/CodeGen/target-builtin-noerror.c +++ b/clang/test/CodeGen/target-builtin-noerror.c @@ -207,4 +207,5 @@ void verifycpustrings(void) { (void)__builtin_cpu_is("znver2"); (void)__builtin_cpu_is("znver3"); (void)__builtin_cpu_is("znver4"); + (void)__builtin_cpu_is("znver5"); } diff --git 
a/clang/test/Driver/x86-march.c b/clang/test/Driver/x86-march.c index cc993b53937c1..3bc2a82ae778d 100644 --- a/clang/test/Driver/x86-march.c +++ b/clang/test/Driver/x86-march.c @@ -242,6 +242,10 @@ // RUN: %clang -target x86_64-unknown-unknown -c -### %s -march=znver4 2>&1 \ // RUN: | FileCheck %s -check-prefix=znver4 // znver4: "-target-cpu" "znver4" +// +// RUN: %clang -target x86_64-unknown-unknown -c -### %s -march=znver5 2>&1 \ +// RUN: | FileCheck %s -check-prefix=znver5 +// znver5: "-target-cpu" "znver5" // RUN: %clang -target x86_64 -c -### %s -march=x86-64 2>&1 | FileCheck %s --check-prefix=x86-64 // x86-64: "-target-cpu" "x86-64" diff --git a/clang/test/Frontend/x86-target-cpu.c b/clang/test/Frontend/x86-target-cpu.c index 6c8502ac2c21e..f2885a040c370 100644 --- a/clang/test/Frontend/x86-target-cpu.c +++ b/clang/test/Frontend/x86-target-cpu.c @@ -38,5 +38,6 @@ // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu znver2 -verify %s // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu znver3 -verify %s // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu znver4 -verify %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu znver5 -verify %s // // expected-no-diagnostics diff --git a/clang/test/Misc/target-invalid-cpu-note/x86.c b/clang/test/Misc/target-invalid-cpu-note/x86.c index 607192a5409ba..7879676040af4 100644 --- a/clang/test/Misc/target-invalid-cpu-note/x86.c +++ b/clang/test/Misc/target-invalid-cpu-note/x86.c @@ -99,6 +99,7 @@ // X86-SAME: {{^}}, znver2 // X86-SAME: {{^}}, znver3 // X86-SAME: {{^}}, znver4 +// X86-SAME: {{^}}, znver5 // X86-SAME: {{^}}, x86-64 // X86-SAME: {{^}}, x86-64-v2 // X86-SAME: {{^}}, x86-64-v3 @@ -175,6 +176,7 @@ // X86_64-SAME: {{^}}, znver2 // X86_64-SAME: {{^}}, znver3 // X86_64-SAME: {{^}}, znver4 +// X86_64-SAME: {{^}}, znver5 // X86_64-SAME: {{^}}, x86-64 // X86_64-SAME: {{^}}, x86-64-v2 // X86_64-SAME: {{^}}, x86-64-v3 @@ -278,6 +280,7 @@ // TUNE_X86-SAME: {{^}}, znver2 // TUNE_X86-SAME: {{^}}, znver3 // TUNE_X86-SAME: {{^}}, znver4 +// TUNE_X86-SAME: {{^}}, znver5 // TUNE_X86-SAME: {{^}}, x86-64 // TUNE_X86-SAME: {{^}}, geode // TUNE_X86-SAME: {{$}} @@ -379,6 +382,7 @@ // TUNE_X86_64-SAME: {{^}}, znver2 // TUNE_X86_64-SAME: {{^}}, znver3 // TUNE_X86_64-SAME: {{^}}, znver4 +// TUNE_X86_64-SAME: {{^}}, znver5 // TUNE_X86_64-SAME: {{^}}, x86-64 // TUNE_X86_64-SAME: {{^}}, geode // TUNE_X86_64-SAME: {{$}} diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index 49646d94d920c..a149c69ee0cdb 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -3923,6 +3923,148 @@ // CHECK_ZNVER4_M64: #define __znver4 1 // CHECK_ZNVER4_M64: #define __znver4__ 1 +// RUN: %clang -march=znver5 -m32 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER5_M32 +// CHECK_ZNVER5_M32-NOT: #define __3dNOW_A__ 1 +// CHECK_ZNVER5_M32-NOT: #define __3dNOW__ 1 +// CHECK_ZNVER5_M32: #define __ADX__ 1 +// CHECK_ZNVER5_M32: #define __AES__ 1 +// CHECK_ZNVER5_M32: #define __AVX2__ 1 +// CHECK_ZNVER5_M32: #define __AVX512BF16__ 1 +// CHECK_ZNVER5_M32: #define __AVX512BITALG__ 1 +// CHECK_ZNVER5_M32: #define __AVX512BW__ 1 +// CHECK_ZNVER5_M32: #define __AVX512CD__ 1 +// CHECK_ZNVER5_M32: #define __AVX512DQ__ 1 +// CHECK_ZNVER5_M32: #define __AVX512F__ 1 +// CHECK_ZNVER5_M32: #define __AVX512IFMA__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VBMI2__ 
1 +// CHECK_ZNVER5_M32: #define __AVX512VBMI__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VL__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VNNI__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VP2INTERSECT__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VPOPCNTDQ__ 1 +// CHECK_ZNVER5_M32: #define __AVXVNNI__ 1 +// CHECK_ZNVER5_M32: #define __AVX__ 1 +// CHECK_ZNVER5_M32: #define __BMI2__ 1 +// CHECK_ZNVER5_M32: #define __BMI__ 1 +// CHECK_ZNVER5_M32: #define __CLFLUSHOPT__ 1 +// CHECK_ZNVER5_M32: #define __CLWB__ 1 +// CHECK_ZNVER5_M32: #define __CLZERO__ 1 +// CHECK_ZNVER5_M32: #define __F16C__ 1 +// CHECK_ZNVER5_M32-NOT: #define __FMA4__ 1 +// CHECK_ZNVER5_M32: #define __FMA__ 1 +// CHECK_ZNVER5_M32: #define __FSGSBASE__ 1 +// CHECK_ZNVER5_M32: #define __GFNI__ 1 +// CHECK_ZNVER5_M32: #define __LZCNT__ 1 +// CHECK_ZNVER5_M32: #define __MMX__ 1 +// CHECK_ZNVER5_M32: #define __MOVDIR64B__ 1 +// CHECK_ZNVER5_M32: #define __MOVDIRI__ 1 +// CHECK_ZNVER5_M32: #define __PCLMUL__ 1 +// CHECK_ZNVER5_M32: #define __PKU__ 1 +// CHECK_ZNVER5_M32: #define __POPCNT__ 1 +// CHECK_ZNVER5_M32: #define __PREFETCHI__ 1 +// CHECK_ZNVER5_M32: #define __PRFCHW__ 1 +// CHECK_ZNVER5_M32: #define __RDPID__ 1 +// CHECK_ZNVER5_M32: #define __RDPRU__ 1 +// CHECK_ZNVER5_M32: #define __RDRND__ 1 +// CHECK_ZNVER5_M32: #define __RDSEED__ 1 +// CHECK_ZNVER5_M32: #define __SHA__ 1 +// CHECK_ZNVER5_M32: #define __SSE2_MATH__ 1 +// CHECK_ZNVER5_M32: #define __SSE2__ 1 +// CHECK_ZNVER5_M32: #define __SSE3__ 1 +// CHECK_ZNVER5_M32: #define __SSE4A__ 1 +// CHECK_ZNVER5_M32: #define __SSE4_1__ 1 +// CHECK_ZNVER5_M32: #define __SSE4_2__ 1 +// CHECK_ZNVER5_M32: #define __SSE_MATH__ 1 +// CHECK_ZNVER5_M32: #define __SSE__ 1 +// CHECK_ZNVER5_M32: #define __SSSE3__ 1 +// CHECK_ZNVER5_M32-NOT: #define __TBM__ 1 +// CHECK_ZNVER5_M32: #define __WBNOINVD__ 1 +// CHECK_ZNVER5_M32-NOT: #define __XOP__ 1 +// CHECK_ZNVER5_M32: #define __XSAVEC__ 1 +// CHECK_ZNVER5_M32: #define __XSAVEOPT__ 1 +// CHECK_ZNVER5_M32: #define __XSAVES__ 1 +// CHECK_ZNVER5_M32: #define __XSAVE__ 1 +// CHECK_ZNVER5_M32: #define __i386 1 +// CHECK_ZNVER5_M32: #define __i386__ 1 +// CHECK_ZNVER5_M32: #define __tune_znver5__ 1 +// CHECK_ZNVER5_M32: #define __znver5 1 +// CHECK_ZNVER5_M32: #define __znver5__ 1 + +// RUN: %clang -march=znver5 -m64 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER5_M64 +// CHECK_ZNVER5_M64-NOT: #define __3dNOW_A__ 1 +// CHECK_ZNVER5_M64-NOT: #define __3dNOW__ 1 +// CHECK_ZNVER5_M64: #define __ADX__ 1 +// CHECK_ZNVER5_M64: #define __AES__ 1 +// CHECK_ZNVER5_M64: #define __AVX2__ 1 +// CHECK_ZNVER5_M64: #define __AVX512BF16__ 1 +// CHECK_ZNVER5_M64: #define __AVX512BITALG__ 1 +// CHECK_ZNVER5_M64: #define __AVX512BW__ 1 +// CHECK_ZNVER5_M64: #define __AVX512CD__ 1 +// CHECK_ZNVER5_M64: #define __AVX512DQ__ 1 +// CHECK_ZNVER5_M64: #define __AVX512F__ 1 +// CHECK_ZNVER5_M64: #define __AVX512IFMA__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VBMI2__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VBMI__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VL__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VNNI__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VP2INTERSECT__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VPOPCNTDQ__ 1 +// CHECK_ZNVER5_M64: #define __AVXVNNI__ 1 +// CHECK_ZNVER5_M64: #define __AVX__ 1 +// CHECK_ZNVER5_M64: #define __BMI2__ 1 +// CHECK_ZNVER5_M64: #define __BMI__ 1 +// CHECK_ZNVER5_M64: #define __CLFLUSHOPT__ 1 +// CHECK_ZNVER5_M64: #define __CLWB__ 1 +// CHECK_ZNVER5_M64: #define __CLZERO__ 1 +// 
CHECK_ZNVER5_M64: #define __F16C__ 1 +// CHECK_ZNVER5_M64-NOT: #define __FMA4__ 1 +// CHECK_ZNVER5_M64: #define __FMA__ 1 +// CHECK_ZNVER5_M64: #define __FSGSBASE__ 1 +// CHECK_ZNVER5_M64: #define __GFNI__ 1 +// CHECK_ZNVER5_M64: #define __LZCNT__ 1 +// CHECK_ZNVER5_M64: #define __MMX__ 1 +// CHECK_ZNVER5_M64: #define __MOVDIR64B__ 1 +// CHECK_ZNVER5_M64: #define __MOVDIRI__ 1 +// CHECK_ZNVER5_M64: #define __PCLMUL__ 1 +// CHECK_ZNVER5_M64: #define __PKU__ 1 +// CHECK_ZNVER5_M64: #define __POPCNT__ 1 +// CHECK_ZNVER5_M64: #define __PREFETCHI__ 1 +// CHECK_ZNVER5_M64: #define __PRFCHW__ 1 +// CHECK_ZNVER5_M64: #define __RDPID__ 1 +// CHECK_ZNVER5_M64: #define __RDPRU__ 1 +// CHECK_ZNVER5_M64: #define __RDRND__ 1 +// CHECK_ZNVER5_M64: #define __RDSEED__ 1 +// CHECK_ZNVER5_M64: #define __SHA__ 1 +// CHECK_ZNVER5_M64: #define __SSE2_MATH__ 1 +// CHECK_ZNVER5_M64: #define __SSE2__ 1 +// CHECK_ZNVER5_M64: #define __SSE3__ 1 +// CHECK_ZNVER5_M64: #define __SSE4A__ 1 +// CHECK_ZNVER5_M64: #define __SSE4_1__ 1 +// CHECK_ZNVER5_M64: #define __SSE4_2__ 1 +// CHECK_ZNVER5_M64: #define __SSE_MATH__ 1 +// CHECK_ZNVER5_M64: #define __SSE__ 1 +// CHECK_ZNVER5_M64: #define __SSSE3__ 1 +// CHECK_ZNVER5_M64-NOT: #define __TBM__ 1 +// CHECK_ZNVER5_M64: #define __VAES__ 1 +// CHECK_ZNVER5_M64: #define __VPCLMULQDQ__ 1 +// CHECK_ZNVER5_M64: #define __WBNOINVD__ 1 +// CHECK_ZNVER5_M64-NOT: #define __XOP__ 1 +// CHECK_ZNVER5_M64: #define __XSAVEC__ 1 +// CHECK_ZNVER5_M64: #define __XSAVEOPT__ 1 +// CHECK_ZNVER5_M64: #define __XSAVES__ 1 +// CHECK_ZNVER5_M64: #define __XSAVE__ 1 +// CHECK_ZNVER5_M64: #define __amd64 1 +// CHECK_ZNVER5_M64: #define __amd64__ 1 +// CHECK_ZNVER5_M64: #define __tune_znver5__ 1 +// CHECK_ZNVER5_M64: #define __x86_64 1 +// CHECK_ZNVER5_M64: #define __x86_64__ 1 +// CHECK_ZNVER5_M64: #define __znver5 1 +// CHECK_ZNVER5_M64: #define __znver5__ 1 + // End X86/GCC/Linux tests ------------------ // Begin PPC/GCC/Linux tests ---------------- diff --git a/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c b/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c new file mode 100644 index 0000000000000..8c7e46c6eca9c --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c @@ -0,0 +1,8 @@ +void foo() { + // expected-error@+1{{use of undeclared identifier 'a'}} + a = 2; a = 2; + b = 2; b = 2; + // expected-error@+1 3{{use of undeclared identifier 'c'}} + c = 2; c = 2; + // expected-error 2{{asdf}} +} diff --git a/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected b/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected new file mode 100644 index 0000000000000..6214ff382f449 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected @@ -0,0 +1,8 @@ +void foo() { + // expected-error@+1 2{{use of undeclared identifier 'a'}} + a = 2; a = 2; + // expected-error@+1 2{{use of undeclared identifier 'b'}} + b = 2; b = 2; + // expected-error@+1 2{{use of undeclared identifier 'c'}} + c = 2; c = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c b/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c new file mode 100644 index 0000000000000..0210ac35fd5cd --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c @@ -0,0 +1,8 @@ +void foo() { + // expected-error@+1 2 {{use of undeclared identifier 'a'}} + a = 2; a = 2; b = 2; b = 2; c = 2; + // expected-error@+1 2 {{asdf}} + d = 2; + e = 2; f = 2; // expected-error 2 {{use of undeclared 
identifier 'e'}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected b/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected new file mode 100644 index 0000000000000..5c5aaeeef97ac --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected @@ -0,0 +1,11 @@ +void foo() { + // expected-error@+3 {{use of undeclared identifier 'c'}} + // expected-error@+2 2 {{use of undeclared identifier 'b'}} + // expected-error@+1 2 {{use of undeclared identifier 'a'}} + a = 2; a = 2; b = 2; b = 2; c = 2; + // expected-error@+1 {{use of undeclared identifier 'd'}} + d = 2; + // expected-error@+1 {{use of undeclared identifier 'f'}} + e = 2; f = 2; // expected-error {{use of undeclared identifier 'e'}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c b/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c new file mode 100644 index 0000000000000..1aa8d088e9727 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c @@ -0,0 +1,11 @@ +void foo() { + a = 2; + // expected-error@-1{{use of undeclared identifier 'a'}} + b = 2;// expected-error{{use of undeclared identifier 'b'}} + c = 2; + // expected-error@5{{use of undeclared identifier 'c'}} + d = 2; // expected-error-re{{use of {{.*}} identifier 'd'}} + + e = 2; // error to trigger mismatch +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected b/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected new file mode 100644 index 0000000000000..6b621061bbfbb --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected @@ -0,0 +1,12 @@ +void foo() { + a = 2; + // expected-error@-1{{use of undeclared identifier 'a'}} + b = 2;// expected-error{{use of undeclared identifier 'b'}} + c = 2; + // expected-error@5{{use of undeclared identifier 'c'}} + d = 2; // expected-error-re{{use of {{.*}} identifier 'd'}} + + // expected-error@+1{{use of undeclared identifier 'e'}} + e = 2; // error to trigger mismatch +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c b/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c new file mode 100644 index 0000000000000..e230e0a337bf4 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c @@ -0,0 +1,6 @@ +void foo() { + a = 2; + b = 2; + + c = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected b/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected new file mode 100644 index 0000000000000..27dc1f30a26fa --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected @@ -0,0 +1,9 @@ +void foo() { + // expected-error@+1{{use of undeclared identifier 'a'}} + a = 2; + // expected-error@+1{{use of undeclared identifier 'b'}} + b = 2; + + // expected-error@+1{{use of undeclared identifier 'c'}} + c = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c b/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c new file mode 100644 index 0000000000000..03f723d44bbe8 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c @@ -0,0 +1,8 @@ +void foo() { + a = 2; b = 2; c = 2; +} + +void bar() { + x = 2; y = 2; z = 2; + // expected-error@-1{{use of undeclared identifier 'x'}} +} diff --git 
a/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected b/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected new file mode 100644 index 0000000000000..24b57f4353d95 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected @@ -0,0 +1,13 @@ +void foo() { + // expected-error@+3{{use of undeclared identifier 'c'}} + // expected-error@+2{{use of undeclared identifier 'b'}} + // expected-error@+1{{use of undeclared identifier 'a'}} + a = 2; b = 2; c = 2; +} + +void bar() { + x = 2; y = 2; z = 2; + // expected-error@-1{{use of undeclared identifier 'x'}} + // expected-error@-2{{use of undeclared identifier 'y'}} + // expected-error@-3{{use of undeclared identifier 'z'}} +} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-checks.c b/clang/test/utils/update-verify-tests/Inputs/no-checks.c new file mode 100644 index 0000000000000..8fd1f7cd33370 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-checks.c @@ -0,0 +1,3 @@ +void foo() { + bar = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected b/clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected new file mode 100644 index 0000000000000..e80548fbe50f2 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected @@ -0,0 +1,4 @@ +void foo() { + // expected-error@+1{{use of undeclared identifier 'bar'}} + bar = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-diags.c b/clang/test/utils/update-verify-tests/Inputs/no-diags.c new file mode 100644 index 0000000000000..66d169be43940 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-diags.c @@ -0,0 +1,5 @@ +void foo() { + // expected-error@+1{{asdf}} + int a = 2; +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected b/clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected new file mode 100644 index 0000000000000..0523028494570 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected @@ -0,0 +1,5 @@ +// expected-no-diagnostics +void foo() { + int a = 2; +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c b/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c new file mode 100644 index 0000000000000..78b72e1357da7 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c @@ -0,0 +1,4 @@ +// expected-no-diagnostics +void foo() { + a = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected b/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected new file mode 100644 index 0000000000000..d948ffce56189 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected @@ -0,0 +1,4 @@ +void foo() { + // expected-error@+1{{use of undeclared identifier 'a'}} + a = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c b/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c new file mode 100644 index 0000000000000..3d63eaf0f1b87 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c @@ -0,0 +1,5 @@ +void foo() { + a = 2; // check-error{{asdf}} + // expected-error@-1{ignored}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected b/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected new file mode 100644 index 
0000000000000..a877f86922123 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected @@ -0,0 +1,5 @@ +void foo() { + a = 2; // check-error{{use of undeclared identifier 'a'}} + // expected-error@-1{ignored}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/update-same-line.c b/clang/test/utils/update-verify-tests/Inputs/update-same-line.c new file mode 100644 index 0000000000000..5278ce0c57c31 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/update-same-line.c @@ -0,0 +1,4 @@ +void foo() { + bar = 2; // expected-error {{asdf}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected b/clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected new file mode 100644 index 0000000000000..8ba47f788319b --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected @@ -0,0 +1,4 @@ +void foo() { + bar = 2; // expected-error {{use of undeclared identifier 'bar'}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/update-single-check.c b/clang/test/utils/update-verify-tests/Inputs/update-single-check.c new file mode 100644 index 0000000000000..20b011bfc3d77 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/update-single-check.c @@ -0,0 +1,4 @@ +void foo() { + // expected-error@+1{{asdf}} + bar = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected b/clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected new file mode 100644 index 0000000000000..e80548fbe50f2 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected @@ -0,0 +1,4 @@ +void foo() { + // expected-error@+1{{use of undeclared identifier 'bar'}} + bar = 2; +} diff --git a/clang/test/utils/update-verify-tests/duplicate-diag.test b/clang/test/utils/update-verify-tests/duplicate-diag.test new file mode 100644 index 0000000000000..3163ce46199c3 --- /dev/null +++ b/clang/test/utils/update-verify-tests/duplicate-diag.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/duplicate-diag.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/duplicate-diag.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c + diff --git a/clang/test/utils/update-verify-tests/infer-indentation.test b/clang/test/utils/update-verify-tests/infer-indentation.test new file mode 100644 index 0000000000000..6ba2f5d9d505b --- /dev/null +++ b/clang/test/utils/update-verify-tests/infer-indentation.test @@ -0,0 +1,3 @@ +# RUN: cp %S/Inputs/infer-indentation.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/infer-indentation.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/leave-existing-diags.test b/clang/test/utils/update-verify-tests/leave-existing-diags.test new file mode 100644 index 0000000000000..cde690ef715a6 --- /dev/null +++ b/clang/test/utils/update-verify-tests/leave-existing-diags.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/leave-existing-diags.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/leave-existing-diags.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c + diff --git a/clang/test/utils/update-verify-tests/lit.local.cfg b/clang/test/utils/update-verify-tests/lit.local.cfg new file mode 100644 index 0000000000000..a0b6afccc2501 --- /dev/null +++ b/clang/test/utils/update-verify-tests/lit.local.cfg @@ -0,0 +1,25 @@ +import lit.util + +# python 2.7 
backwards compatibility +try: + from shlex import quote as shell_quote +except ImportError: + from pipes import quote as shell_quote + +if config.standalone_build: + # These tests require the update-verify-tests.py script from the clang + # source tree, so skip these tests if we are doing standalone builds. + config.unsupported = True +else: + config.suffixes = [".test"] + + script_path = os.path.join( + config.clang_src_dir, "utils", "update-verify-tests.py" + ) + python = shell_quote(config.python_executable) + config.substitutions.append( + ( + "%update-verify-tests", + "%s %s" % (python, shell_quote(script_path)), + ) + ) diff --git a/clang/test/utils/update-verify-tests/multiple-errors.test b/clang/test/utils/update-verify-tests/multiple-errors.test new file mode 100644 index 0000000000000..1332ef365dc86 --- /dev/null +++ b/clang/test/utils/update-verify-tests/multiple-errors.test @@ -0,0 +1,3 @@ +# RUN: cp %S/Inputs/multiple-errors.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/multiple-errors.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test b/clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test new file mode 100644 index 0000000000000..a9c21cd77e192 --- /dev/null +++ b/clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test @@ -0,0 +1,3 @@ +# RUN: cp %S/Inputs/multiple-missing-errors-same-line.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/multiple-missing-errors-same-line.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/no-checks.test b/clang/test/utils/update-verify-tests/no-checks.test new file mode 100644 index 0000000000000..f6ea91fa552be --- /dev/null +++ b/clang/test/utils/update-verify-tests/no-checks.test @@ -0,0 +1,3 @@ +# RUN: cp %S/Inputs/no-checks.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/no-checks.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/no-diags.test b/clang/test/utils/update-verify-tests/no-diags.test new file mode 100644 index 0000000000000..464fe8894253b --- /dev/null +++ b/clang/test/utils/update-verify-tests/no-diags.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/no-diags.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/no-diags.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c + diff --git a/clang/test/utils/update-verify-tests/no-expected-diags.test b/clang/test/utils/update-verify-tests/no-expected-diags.test new file mode 100644 index 0000000000000..75235f17a64a2 --- /dev/null +++ b/clang/test/utils/update-verify-tests/no-expected-diags.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/no-expected-diags.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/no-expected-diags.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c + diff --git a/clang/test/utils/update-verify-tests/non-default-prefix.test b/clang/test/utils/update-verify-tests/non-default-prefix.test new file mode 100644 index 0000000000000..e581755a6e603 --- /dev/null +++ b/clang/test/utils/update-verify-tests/non-default-prefix.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/non-default-prefix.c %t.c && not %clang_cc1 -verify=check %t.c 2>&1 | %update-verify-tests --prefix check +# RUN: diff -u %S/Inputs/non-default-prefix.c.expected %t.c +# RUN: %clang_cc1 -verify=check 
%t.c + diff --git a/clang/test/utils/update-verify-tests/update-same-line.test b/clang/test/utils/update-verify-tests/update-same-line.test new file mode 100644 index 0000000000000..324768eae5faa --- /dev/null +++ b/clang/test/utils/update-verify-tests/update-same-line.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/update-same-line.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/update-same-line.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c + diff --git a/clang/test/utils/update-verify-tests/update-single-check.test b/clang/test/utils/update-verify-tests/update-single-check.test new file mode 100644 index 0000000000000..2cb1ae3bcbd3b --- /dev/null +++ b/clang/test/utils/update-verify-tests/update-single-check.test @@ -0,0 +1,3 @@ +# RUN: cp %S/Inputs/update-single-check.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/update-single-check.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c diff --git a/clang/utils/TableGen/ClangSACheckersEmitter.cpp b/clang/utils/TableGen/ClangSACheckersEmitter.cpp index 2a2e466ae1979..44c2d8b31655d 100644 --- a/clang/utils/TableGen/ClangSACheckersEmitter.cpp +++ b/clang/utils/TableGen/ClangSACheckersEmitter.cpp @@ -174,9 +174,11 @@ static void printOption(llvm::raw_ostream &OS, StringRef FullName, OS << "true"; } -void clang::EmitClangSACheckers(RecordKeeper &Records, raw_ostream &OS) { - std::vector<Record *> checkers = Records.getAllDerivedDefinitions("Checker"); - std::vector<Record *> packages = Records.getAllDerivedDefinitions("Package"); +void clang::EmitClangSACheckers(const RecordKeeper &Records, raw_ostream &OS) { + ArrayRef<const Record *> checkers = + Records.getAllDerivedDefinitions("Checker"); + ArrayRef<const Record *> packages = + Records.getAllDerivedDefinitions("Package"); using SortedRecords = llvm::StringMap<const Record *>; diff --git a/clang/utils/TableGen/ClangSyntaxEmitter.cpp b/clang/utils/TableGen/ClangSyntaxEmitter.cpp index 2a69e4c353b6b..66b27be88f56f 100644 --- a/clang/utils/TableGen/ClangSyntaxEmitter.cpp +++ b/clang/utils/TableGen/ClangSyntaxEmitter.cpp @@ -41,11 +41,12 @@ using llvm::formatv; // stable and useful way, where abstract Node subclasses correspond to ranges. 
class Hierarchy { public: - Hierarchy(llvm::RecordKeeper &Records) { - for (llvm::Record *T : Records.getAllDerivedDefinitions("NodeType")) + Hierarchy(const llvm::RecordKeeper &Records) { + for (const llvm::Record *T : Records.getAllDerivedDefinitions("NodeType")) add(T); - for (llvm::Record *Derived : Records.getAllDerivedDefinitions("NodeType")) - if (llvm::Record *Base = Derived->getValueAsOptionalDef("base")) + for (const llvm::Record *Derived : + Records.getAllDerivedDefinitions("NodeType")) + if (const llvm::Record *Base = Derived->getValueAsOptionalDef("base")) link(Derived, Base); for (NodeType &N : AllTypes) { llvm::sort(N.Derived, [](const NodeType *L, const NodeType *R) { @@ -127,7 +128,7 @@ struct SyntaxConstraint { } // namespace -void clang::EmitClangSyntaxNodeList(llvm::RecordKeeper &Records, +void clang::EmitClangSyntaxNodeList(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS) { llvm::emitSourceFileHeader("Syntax tree node list", OS, Records); Hierarchy H(Records); @@ -186,7 +187,7 @@ static void printDoc(llvm::StringRef Doc, llvm::raw_ostream &OS) { } } -void clang::EmitClangSyntaxNodeClasses(llvm::RecordKeeper &Records, +void clang::EmitClangSyntaxNodeClasses(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS) { llvm::emitSourceFileHeader("Syntax tree node list", OS, Records); Hierarchy H(Records); diff --git a/clang/utils/TableGen/ClangTypeNodesEmitter.cpp b/clang/utils/TableGen/ClangTypeNodesEmitter.cpp index 66bdf5e67602b..41a2d0cd066fe 100644 --- a/clang/utils/TableGen/ClangTypeNodesEmitter.cpp +++ b/clang/utils/TableGen/ClangTypeNodesEmitter.cpp @@ -74,16 +74,15 @@ using namespace clang::tblgen; namespace { class TypeNodeEmitter { - RecordKeeper &Records; + const RecordKeeper &Records; raw_ostream &Out; - const std::vector Types; + ArrayRef Types; std::vector MacrosToUndef; public: - TypeNodeEmitter(RecordKeeper &records, raw_ostream &out) - : Records(records), Out(out), - Types(Records.getAllDerivedDefinitions(TypeNodeClassName)) { - } + TypeNodeEmitter(const RecordKeeper &records, raw_ostream &out) + : Records(records), Out(out), + Types(Records.getAllDerivedDefinitions(TypeNodeClassName)) {} void emit(); @@ -203,6 +202,6 @@ void TypeNodeEmitter::emitUndefs() { } } -void clang::EmitClangTypeNodes(RecordKeeper &records, raw_ostream &out) { +void clang::EmitClangTypeNodes(const RecordKeeper &records, raw_ostream &out) { TypeNodeEmitter(records, out).emit(); } diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index bb4f091604f5e..6cfaa891241fa 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -958,7 +958,7 @@ class ACLEIntrinsic { ";\n"; } - ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param); + ACLEIntrinsic(EmitterBase &ME, const Record *R, const Type *Param); }; // ----------------------------------------------------------------------------- @@ -988,7 +988,7 @@ class EmitterBase { const ScalarType *getScalarType(StringRef Name) { return ScalarTypes[std::string(Name)].get(); } - const ScalarType *getScalarType(Record *R) { + const ScalarType *getScalarType(const Record *R) { return getScalarType(R->getName()); } const VectorType *getVectorType(const ScalarType *ST, unsigned Lanes) { @@ -1028,7 +1028,7 @@ class EmitterBase { // the Params list in the Tablegen record for the intrinsic), which is used // to expand Tablegen classes like 'Vector' which mean something different in // each member of a parametric family. 
- const Type *getType(Record *R, const Type *Param); + const Type *getType(const Record *R, const Type *Param); const Type *getType(DagInit *D, const Type *Param); const Type *getType(Init *I, const Type *Param); @@ -1046,7 +1046,7 @@ class EmitterBase { // Constructor and top-level functions. - EmitterBase(RecordKeeper &Records); + EmitterBase(const RecordKeeper &Records); virtual ~EmitterBase() = default; virtual void EmitHeader(raw_ostream &OS) = 0; @@ -1065,7 +1065,7 @@ const Type *EmitterBase::getType(Init *I, const Type *Param) { PrintFatalError("Could not convert this value into a type"); } -const Type *EmitterBase::getType(Record *R, const Type *Param) { +const Type *EmitterBase::getType(const Record *R, const Type *Param) { // Pass to a subfield of any wrapper records. We don't expect more than one // of these: immediate operands are used as plain numbers rather than as // llvm::Value, so it's meaningless to promote their type anyway. @@ -1088,7 +1088,7 @@ const Type *EmitterBase::getType(DagInit *D, const Type *Param) { // The meat of the getType system: types in the Tablegen are represented by a // dag whose operators select sub-cases of this function. - Record *Op = cast(D->getOperator())->getDef(); + const Record *Op = cast(D->getOperator())->getDef(); if (!Op->isSubClassOf("ComplexTypeOp")) PrintFatalError( "Expected ComplexTypeOp as dag operator in type expression"); @@ -1154,7 +1154,7 @@ const Type *EmitterBase::getType(DagInit *D, const Type *Param) { Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope, const Type *Param) { - Record *Op = cast(D->getOperator())->getDef(); + const Record *Op = cast(D->getOperator())->getDef(); if (Op->getName() == "seq") { Result::Scope SubScope = Scope; @@ -1211,7 +1211,7 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope, } else if (Op->getName() == "unsignedflag") { if (D->getNumArgs() != 1) PrintFatalError("unsignedflag should have exactly one argument"); - Record *TypeRec = cast(D->getArg(0))->getDef(); + const Record *TypeRec = cast(D->getArg(0))->getDef(); if (!TypeRec->isSubClassOf("Type")) PrintFatalError("unsignedflag's argument should be a type"); if (const auto *ST = dyn_cast(getType(TypeRec, Param))) { @@ -1223,7 +1223,7 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope, } else if (Op->getName() == "bitsize") { if (D->getNumArgs() != 1) PrintFatalError("bitsize should have exactly one argument"); - Record *TypeRec = cast(D->getArg(0))->getDef(); + const Record *TypeRec = cast(D->getArg(0))->getDef(); if (!TypeRec->isSubClassOf("Type")) PrintFatalError("bitsize's argument should be a type"); if (const auto *ST = dyn_cast(getType(TypeRec, Param))) { @@ -1239,7 +1239,7 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope, if (Op->isSubClassOf("IRBuilderBase")) { std::set AddressArgs; std::map IntegerArgs; - for (Record *sp : Op->getValueAsListOfDefs("special_params")) { + for (const Record *sp : Op->getValueAsListOfDefs("special_params")) { unsigned Index = sp->getValueAsInt("index"); if (sp->isSubClassOf("IRBuilderAddrParam")) { AddressArgs.insert(Index); @@ -1251,7 +1251,7 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope, Args, AddressArgs, IntegerArgs); } else if (Op->isSubClassOf("IRIntBase")) { std::vector ParamTypes; - for (Record *RParam : Op->getValueAsListOfDefs("params")) + for (const Record *RParam : Op->getValueAsListOfDefs("params")) ParamTypes.push_back(getType(RParam, 
Param)); std::string IntName = std::string(Op->getValueAsString("intname")); if (Op->getValueAsBit("appendKind")) @@ -1294,7 +1294,7 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum, return getCodeForDag(DI, Scope, Param); if (auto *DI = dyn_cast(Arg)) { - Record *Rec = DI->getDef(); + const Record *Rec = DI->getDef(); if (Rec->isSubClassOf("Type")) { const Type *T = getType(Rec, Param); return std::make_shared(T); @@ -1328,7 +1328,8 @@ Result::Ptr EmitterBase::getCodeForArg(unsigned ArgNum, const Type *ArgType, return V; } -ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param) +ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R, + const Type *Param) : ReturnType(ME.getType(R->getValueAsDef("ret"), Param)) { // Derive the intrinsic's full name, by taking the name of the // Tablegen record (or override) and appending the suffix from its @@ -1346,7 +1347,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param) // full name as specified by its 'pnt' member ('polymorphic name type'), // which indicates how many type suffixes to remove, and any other piece of // the name that should be removed. - Record *PolymorphicNameType = R->getValueAsDef("pnt"); + const Record *PolymorphicNameType = R->getValueAsDef("pnt"); SmallVector NameParts; StringRef(FullName).split(NameParts, '_'); for (unsigned i = 0, e = PolymorphicNameType->getValueAsInt( @@ -1393,11 +1394,11 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param) // what values it can take, for Sema checking. bool Immediate = false; if (auto TypeDI = dyn_cast(TypeInit)) { - Record *TypeRec = TypeDI->getDef(); + const Record *TypeRec = TypeDI->getDef(); if (TypeRec->isSubClassOf("Immediate")) { Immediate = true; - Record *Bounds = TypeRec->getValueAsDef("bounds"); + const Record *Bounds = TypeRec->getValueAsDef("bounds"); ImmediateArg &IA = ImmediateArgs[i]; if (Bounds->isSubClassOf("IB_ConstRange")) { IA.boundsType = ImmediateArg::BoundsType::ExplicitRange; @@ -1440,7 +1441,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param) // Finally, go through the codegen dag and translate it into a Result object // (with an arbitrary DAG of depended-on Results hanging off it). DagInit *CodeDag = R->getValueAsDag("codegen"); - Record *MainOp = cast(CodeDag->getOperator())->getDef(); + const Record *MainOp = cast(CodeDag->getOperator())->getDef(); if (MainOp->isSubClassOf("CustomCodegen")) { // Or, if it's the special case of CustomCodegen, just accumulate // a list of parameters we're going to assign to variables before @@ -1464,7 +1465,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param) } } -EmitterBase::EmitterBase(RecordKeeper &Records) { +EmitterBase::EmitterBase(const RecordKeeper &Records) { // Construct the whole EmitterBase. // First, look up all the instances of PrimitiveType. This gives us the list @@ -1472,13 +1473,13 @@ EmitterBase::EmitterBase(RecordKeeper &Records) { // collect all the useful ScalarType instances into a big list so that we can // use it for operations such as 'find the unsigned version of this signed // integer type'. - for (Record *R : Records.getAllDerivedDefinitions("PrimitiveType")) + for (const Record *R : Records.getAllDerivedDefinitions("PrimitiveType")) ScalarTypes[std::string(R->getName())] = std::make_unique(R); // Now go through the instances of Intrinsic, and for each one, iterate // through its list of type parameters making an ACLEIntrinsic for each one. 
- for (Record *R : Records.getAllDerivedDefinitions("Intrinsic")) { - for (Record *RParam : R->getValueAsListOfDefs("params")) { + for (const Record *R : Records.getAllDerivedDefinitions("Intrinsic")) { + for (const Record *RParam : R->getValueAsListOfDefs("params")) { const Type *Param = getType(RParam, getVoidType()); auto Intrinsic = std::make_unique(*this, R, Param); ACLEIntrinsics[Intrinsic->fullName()] = std::move(Intrinsic); @@ -1752,7 +1753,7 @@ void EmitterBase::GroupSemaChecks( class MveEmitter : public EmitterBase { public: - MveEmitter(RecordKeeper &Records) : EmitterBase(Records){}; + MveEmitter(const RecordKeeper &Records) : EmitterBase(Records) {} void EmitHeader(raw_ostream &OS) override; void EmitBuiltinDef(raw_ostream &OS) override; void EmitBuiltinSema(raw_ostream &OS) override; @@ -2010,14 +2011,14 @@ class CdeEmitter : public EmitterBase { std::map FunctionMacros; public: - CdeEmitter(RecordKeeper &Records); + CdeEmitter(const RecordKeeper &Records); void EmitHeader(raw_ostream &OS) override; void EmitBuiltinDef(raw_ostream &OS) override; void EmitBuiltinSema(raw_ostream &OS) override; }; -CdeEmitter::CdeEmitter(RecordKeeper &Records) : EmitterBase(Records) { - for (Record *R : Records.getAllDerivedDefinitions("FunctionMacro")) +CdeEmitter::CdeEmitter(const RecordKeeper &Records) : EmitterBase(Records) { + for (const Record *R : Records.getAllDerivedDefinitions("FunctionMacro")) FunctionMacros.emplace(R->getName(), FunctionMacro(*R)); } @@ -2179,45 +2180,45 @@ namespace clang { // MVE -void EmitMveHeader(RecordKeeper &Records, raw_ostream &OS) { +void EmitMveHeader(const RecordKeeper &Records, raw_ostream &OS) { MveEmitter(Records).EmitHeader(OS); } -void EmitMveBuiltinDef(RecordKeeper &Records, raw_ostream &OS) { +void EmitMveBuiltinDef(const RecordKeeper &Records, raw_ostream &OS) { MveEmitter(Records).EmitBuiltinDef(OS); } -void EmitMveBuiltinSema(RecordKeeper &Records, raw_ostream &OS) { +void EmitMveBuiltinSema(const RecordKeeper &Records, raw_ostream &OS) { MveEmitter(Records).EmitBuiltinSema(OS); } -void EmitMveBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { +void EmitMveBuiltinCG(const RecordKeeper &Records, raw_ostream &OS) { MveEmitter(Records).EmitBuiltinCG(OS); } -void EmitMveBuiltinAliases(RecordKeeper &Records, raw_ostream &OS) { +void EmitMveBuiltinAliases(const RecordKeeper &Records, raw_ostream &OS) { MveEmitter(Records).EmitBuiltinAliases(OS); } // CDE -void EmitCdeHeader(RecordKeeper &Records, raw_ostream &OS) { +void EmitCdeHeader(const RecordKeeper &Records, raw_ostream &OS) { CdeEmitter(Records).EmitHeader(OS); } -void EmitCdeBuiltinDef(RecordKeeper &Records, raw_ostream &OS) { +void EmitCdeBuiltinDef(const RecordKeeper &Records, raw_ostream &OS) { CdeEmitter(Records).EmitBuiltinDef(OS); } -void EmitCdeBuiltinSema(RecordKeeper &Records, raw_ostream &OS) { +void EmitCdeBuiltinSema(const RecordKeeper &Records, raw_ostream &OS) { CdeEmitter(Records).EmitBuiltinSema(OS); } -void EmitCdeBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { +void EmitCdeBuiltinCG(const RecordKeeper &Records, raw_ostream &OS) { CdeEmitter(Records).EmitBuiltinCG(OS); } -void EmitCdeBuiltinAliases(RecordKeeper &Records, raw_ostream &OS) { +void EmitCdeBuiltinAliases(const RecordKeeper &Records, raw_ostream &OS) { CdeEmitter(Records).EmitBuiltinAliases(OS); } diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 4707ce1ea3b79..9e5480be20ada 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp 
@@ -59,7 +59,7 @@ namespace { // While globals are generally bad, this one allows us to perform assertions // liberally and somehow still trace them back to the def they indirectly // came from. -static Record *CurrentRecord = nullptr; +static const Record *CurrentRecord = nullptr; static void assert_with_loc(bool Assertion, const std::string &Str) { if (!Assertion) { if (CurrentRecord) @@ -308,7 +308,7 @@ class Variable { /// a particular typespec and prototype. class Intrinsic { /// The Record this intrinsic was created from. - Record *R; + const Record *R; /// The unmangled name. std::string Name; /// The input and output typespecs. InTS == OutTS except when @@ -371,7 +371,7 @@ class Intrinsic { } public: - Intrinsic(Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS, + Intrinsic(const Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS, TypeSpec InTS, ClassKind CK, ListInit *Body, NeonEmitter &Emitter, StringRef ArchGuard, StringRef TargetGuard, bool IsUnavailable, bool BigEndianSafe) @@ -442,7 +442,7 @@ class Intrinsic { } /// Get the Record that this intrinsic is based off. - Record *getRecord() const { return R; } + const Record *getRecord() const { return R; } /// Get the set of Intrinsics that this intrinsic calls. /// this is the set of immediate dependencies, NOT the /// transitive closure. @@ -576,12 +576,12 @@ class Intrinsic { //===----------------------------------------------------------------------===// class NeonEmitter { - RecordKeeper &Records; - DenseMap ClassMap; + const RecordKeeper &Records; + DenseMap ClassMap; std::map> IntrinsicMap; unsigned UniqueNumber; - void createIntrinsic(Record *R, SmallVectorImpl &Out); + void createIntrinsic(const Record *R, SmallVectorImpl &Out); void genBuiltinsDef(raw_ostream &OS, SmallVectorImpl &Defs); void genStreamingSVECompatibleList(raw_ostream &OS, SmallVectorImpl &Defs); @@ -601,15 +601,15 @@ class NeonEmitter { /// Called by Intrinsic - returns a globally-unique number. 
unsigned getUniqueNumber() { return UniqueNumber++; } - NeonEmitter(RecordKeeper &R) : Records(R), UniqueNumber(0) { - Record *SI = R.getClass("SInst"); - Record *II = R.getClass("IInst"); - Record *WI = R.getClass("WInst"); - Record *SOpI = R.getClass("SOpInst"); - Record *IOpI = R.getClass("IOpInst"); - Record *WOpI = R.getClass("WOpInst"); - Record *LOpI = R.getClass("LOpInst"); - Record *NoTestOpI = R.getClass("NoTestOpInst"); + NeonEmitter(const RecordKeeper &R) : Records(R), UniqueNumber(0) { + const Record *SI = R.getClass("SInst"); + const Record *II = R.getClass("IInst"); + const Record *WI = R.getClass("WInst"); + const Record *SOpI = R.getClass("SOpInst"); + const Record *IOpI = R.getClass("IOpInst"); + const Record *WOpI = R.getClass("WOpInst"); + const Record *LOpI = R.getClass("LOpInst"); + const Record *NoTestOpI = R.getClass("NoTestOpInst"); ClassMap[SI] = ClassS; ClassMap[II] = ClassI; @@ -1979,12 +1979,12 @@ Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef Types, return *GoodVec.front(); } -void NeonEmitter::createIntrinsic(Record *R, +void NeonEmitter::createIntrinsic(const Record *R, SmallVectorImpl &Out) { std::string Name = std::string(R->getValueAsString("Name")); std::string Proto = std::string(R->getValueAsString("Prototype")); std::string Types = std::string(R->getValueAsString("Types")); - Record *OperationRec = R->getValueAsDef("Operation"); + const Record *OperationRec = R->getValueAsDef("Operation"); bool BigEndianSafe = R->getValueAsBit("BigEndianSafe"); std::string ArchGuard = std::string(R->getValueAsString("ArchGuard")); std::string TargetGuard = std::string(R->getValueAsString("TargetGuard")); @@ -2240,10 +2240,8 @@ void NeonEmitter::genIntrinsicRangeCheckCode( /// 2. the SemaChecking code for the type overload checking. /// 3. the SemaChecking code for validation of intrinsic immediate arguments. 
void NeonEmitter::runHeader(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - SmallVector Defs; - for (auto *R : RV) + for (const Record *R : Records.getAllDerivedDefinitions("Inst")) createIntrinsic(R, Defs); // Generate shared BuiltinsXXX.def @@ -2402,8 +2400,7 @@ void NeonEmitter::run(raw_ostream &OS) { "__nodebug__))\n\n"; SmallVector Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) + for (const Record *R : Records.getAllDerivedDefinitions("Inst")) createIntrinsic(R, Defs); for (auto *I : Defs) @@ -2510,8 +2507,7 @@ void NeonEmitter::runFP16(raw_ostream &OS) { "__nodebug__))\n\n"; SmallVector Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) + for (const Record *R : Records.getAllDerivedDefinitions("Inst")) createIntrinsic(R, Defs); for (auto *I : Defs) @@ -2619,8 +2615,7 @@ void NeonEmitter::runBF16(raw_ostream &OS) { "__nodebug__))\n\n"; SmallVector Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) + for (const Record *R : Records.getAllDerivedDefinitions("Inst")) createIntrinsic(R, Defs); for (auto *I : Defs) @@ -2674,26 +2669,26 @@ void NeonEmitter::runBF16(raw_ostream &OS) { OS << "#endif\n"; } -void clang::EmitNeon(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitNeon(const RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).run(OS); } -void clang::EmitFP16(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitFP16(const RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).runFP16(OS); } -void clang::EmitBF16(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitBF16(const RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).runBF16(OS); } -void clang::EmitNeonSema(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitNeonSema(const RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).runHeader(OS); } -void clang::EmitVectorTypes(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitVectorTypes(const RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).runVectorTypes(OS); } -void clang::EmitNeonTest(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitNeonTest(const RecordKeeper &Records, raw_ostream &OS) { llvm_unreachable("Neon test generation no longer implemented!"); } diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp index d05236bb4e909..4ef83e7b608dc 100644 --- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -95,11 +95,11 @@ class SemaSignatureTable { class RVVEmitter { private: - RecordKeeper &Records; + const RecordKeeper &Records; RVVTypeCache TypeCache; public: - RVVEmitter(RecordKeeper &R) : Records(R) {} + RVVEmitter(const RecordKeeper &R) : Records(R) {} /// Emit riscv_vector.h void createHeader(raw_ostream &o); @@ -554,8 +554,7 @@ void RVVEmitter::createCodeGen(raw_ostream &OS) { void RVVEmitter::createRVVIntrinsics( std::vector> &Out, std::vector *SemaRecords) { - std::vector RV = Records.getAllDerivedDefinitions("RVVBuiltin"); - for (auto *R : RV) { + for (const Record *R : Records.getAllDerivedDefinitions("RVVBuiltin")) { StringRef Name = R->getValueAsString("Name"); StringRef SuffixProto = R->getValueAsString("Suffix"); StringRef OverloadedName = R->getValueAsString("OverloadedName"); @@ -565,10 +564,10 @@ void RVVEmitter::createRVVIntrinsics( bool HasMasked = R->getValueAsBit("HasMasked"); bool HasMaskedOffOperand = 
R->getValueAsBit("HasMaskedOffOperand"); bool HasVL = R->getValueAsBit("HasVL"); - Record *MPSRecord = R->getValueAsDef("MaskedPolicyScheme"); + const Record *MPSRecord = R->getValueAsDef("MaskedPolicyScheme"); auto MaskedPolicyScheme = static_cast(MPSRecord->getValueAsInt("Value")); - Record *UMPSRecord = R->getValueAsDef("UnMaskedPolicyScheme"); + const Record *UMPSRecord = R->getValueAsDef("UnMaskedPolicyScheme"); auto UnMaskedPolicyScheme = static_cast(UMPSRecord->getValueAsInt("Value")); std::vector Log2LMULList = R->getValueAsListOfInts("Log2LMUL"); @@ -752,9 +751,7 @@ void RVVEmitter::createRVVIntrinsics( } void RVVEmitter::printHeaderCode(raw_ostream &OS) { - std::vector RVVHeaders = - Records.getAllDerivedDefinitions("RVVHeader"); - for (auto *R : RVVHeaders) { + for (const Record *R : Records.getAllDerivedDefinitions("RVVHeader")) { StringRef HeaderCodeStr = R->getValueAsString("HeaderCode"); OS << HeaderCodeStr.str(); } @@ -822,19 +819,19 @@ void RVVEmitter::createSema(raw_ostream &OS) { } namespace clang { -void EmitRVVHeader(RecordKeeper &Records, raw_ostream &OS) { +void EmitRVVHeader(const RecordKeeper &Records, raw_ostream &OS) { RVVEmitter(Records).createHeader(OS); } -void EmitRVVBuiltins(RecordKeeper &Records, raw_ostream &OS) { +void EmitRVVBuiltins(const RecordKeeper &Records, raw_ostream &OS) { RVVEmitter(Records).createBuiltins(OS); } -void EmitRVVBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { +void EmitRVVBuiltinCG(const RecordKeeper &Records, raw_ostream &OS) { RVVEmitter(Records).createCodeGen(OS); } -void EmitRVVBuiltinSema(RecordKeeper &Records, raw_ostream &OS) { +void EmitRVVBuiltinSema(const RecordKeeper &Records, raw_ostream &OS) { RVVEmitter(Records).createSema(OS); } diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index b2e2db1a40990..5abf6fc49bc30 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -280,7 +280,7 @@ class SVEEmitter { static const std::array Reinterprets; - RecordKeeper &Records; + const RecordKeeper &Records; llvm::StringMap EltTypes; llvm::StringMap MemEltTypes; llvm::StringMap FlagTypes; @@ -288,7 +288,7 @@ class SVEEmitter { llvm::StringMap ImmCheckTypes; public: - SVEEmitter(RecordKeeper &R) : Records(R) { + SVEEmitter(const RecordKeeper &R) : Records(R) { for (auto *RV : Records.getAllDerivedDefinitions("EltType")) EltTypes[RV->getNameInitAsString()] = RV->getValueAsInt("Value"); for (auto *RV : Records.getAllDerivedDefinitions("MemEltType")) @@ -397,7 +397,7 @@ class SVEEmitter { void createBuiltinZAState(raw_ostream &OS); /// Create intrinsic and add it to \p Out - void createIntrinsic(Record *R, + void createIntrinsic(const Record *R, SmallVectorImpl> &Out); }; @@ -1151,7 +1151,7 @@ uint64_t SVEEmitter::encodeTypeFlags(const SVEType &T) { } void SVEEmitter::createIntrinsic( - Record *R, SmallVectorImpl> &Out) { + const Record *R, SmallVectorImpl> &Out) { StringRef Name = R->getValueAsString("Name"); StringRef Proto = R->getValueAsString("Prototype"); StringRef Types = R->getValueAsString("Types"); @@ -1225,7 +1225,7 @@ void SVEEmitter::createCoreHeaderIntrinsics(raw_ostream &OS, SVEEmitter &Emitter, ACLEKind Kind) { SmallVector, 128> Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); for (auto *R : RV) createIntrinsic(R, Defs); @@ -1427,7 +1427,7 @@ void SVEEmitter::createHeader(raw_ostream &OS) { } void SVEEmitter::createBuiltins(raw_ostream &OS) { - std::vector RV = 
Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) createIntrinsic(R, Defs); @@ -1469,7 +1469,7 @@ void SVEEmitter::createBuiltins(raw_ostream &OS) { } void SVEEmitter::createCodeGenMap(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) createIntrinsic(R, Defs); @@ -1502,7 +1502,7 @@ void SVEEmitter::createCodeGenMap(raw_ostream &OS) { } void SVEEmitter::createRangeChecks(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) createIntrinsic(R, Defs); @@ -1634,7 +1634,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { } void SVEEmitter::createSMEBuiltins(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) { createIntrinsic(R, Defs); @@ -1662,7 +1662,7 @@ void SVEEmitter::createSMEBuiltins(raw_ostream &OS) { } void SVEEmitter::createSMECodeGenMap(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) { createIntrinsic(R, Defs); @@ -1696,7 +1696,7 @@ void SVEEmitter::createSMECodeGenMap(raw_ostream &OS) { } void SVEEmitter::createSMERangeChecks(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) { createIntrinsic(R, Defs); @@ -1733,7 +1733,7 @@ void SVEEmitter::createSMERangeChecks(raw_ostream &OS) { } void SVEEmitter::createBuiltinZAState(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) createIntrinsic(R, Defs); @@ -1773,7 +1773,7 @@ void SVEEmitter::createBuiltinZAState(raw_ostream &OS) { } void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) createIntrinsic(R, Defs); @@ -1826,55 +1826,55 @@ void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { } namespace clang { -void EmitSveHeader(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveHeader(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createHeader(OS); } -void EmitSveBuiltins(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveBuiltins(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createBuiltins(OS); } -void EmitSveBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveBuiltinCG(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createCodeGenMap(OS); } -void EmitSveRangeChecks(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveRangeChecks(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createRangeChecks(OS); } -void EmitSveTypeFlags(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveTypeFlags(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createTypeFlags(OS); } -void EmitImmCheckTypes(RecordKeeper &Records, raw_ostream &OS) { +void 
EmitImmCheckTypes(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createImmCheckTypes(OS); } -void EmitSveStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveStreamingAttrs(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SVE); } -void EmitSmeHeader(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeHeader(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMEHeader(OS); } -void EmitSmeBuiltins(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeBuiltins(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMEBuiltins(OS); } -void EmitSmeBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeBuiltinCG(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMECodeGenMap(OS); } -void EmitSmeRangeChecks(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeRangeChecks(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMERangeChecks(OS); } -void EmitSmeStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeStreamingAttrs(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SME); } -void EmitSmeBuiltinZAState(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeBuiltinZAState(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createBuiltinZAState(OS); } } // End namespace clang diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 01d16d2dc3e5f..f7527ac535a87 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -39,7 +39,8 @@ void EmitClangBasicReader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangBasicWriter(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangTypeNodes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitClangTypeNodes(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitClangTypeReader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangTypeWriter(const llvm::RecordKeeper &Records, @@ -93,7 +94,8 @@ void EmitClangDiagGroups(const llvm::RecordKeeper &Records, void EmitClangDiagsIndexName(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangSACheckers(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitClangSACheckers(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitClangCommentHTMLTags(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); @@ -108,49 +110,62 @@ void EmitClangCommentCommandList(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangOpcodes(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangSyntaxNodeList(llvm::RecordKeeper &Records, +void EmitClangSyntaxNodeList(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangSyntaxNodeClasses(llvm::RecordKeeper &Records, +void EmitClangSyntaxNodeClasses(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitNeon(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitFP16(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitBF16(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitNeonSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitVectorTypes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitNeonTest(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); - -void 
EmitImmCheckTypes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); - -void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSmeBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSmeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSmeRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSmeStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSmeBuiltinZAState(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); - -void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitMveBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitMveBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitMveBuiltinAliases(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); - -void EmitRVVHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitRVVBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitRVVBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitRVVBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); - -void EmitCdeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitCdeBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitCdeBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitCdeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitCdeBuiltinAliases(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitNeon(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitFP16(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitBF16(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitNeonSema(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitVectorTypes(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitNeonTest(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); + +void EmitImmCheckTypes(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitSveHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveBuiltins(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveTypeFlags(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveRangeChecks(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitSveStreamingAttrs(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); + +void EmitSmeHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeBuiltins(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeRangeChecks(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitSmeStreamingAttrs(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void 
EmitSmeBuiltinZAState(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); + +void EmitMveHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitMveBuiltinDef(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitMveBuiltinSema(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitMveBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitMveBuiltinAliases(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); + +void EmitRVVHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitRVVBuiltins(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitRVVBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitRVVBuiltinSema(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); + +void EmitCdeHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitCdeBuiltinDef(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitCdeBuiltinSema(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitCdeBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitCdeBuiltinAliases(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitClangAttrDocs(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); diff --git a/clang/utils/UpdateVerifyTests/core.py b/clang/utils/UpdateVerifyTests/core.py new file mode 100644 index 0000000000000..d1350cdbb698b --- /dev/null +++ b/clang/utils/UpdateVerifyTests/core.py @@ -0,0 +1,452 @@ +import sys +import re + +DEBUG = False + + +def dprint(*args): + if DEBUG: + print(*args, file=sys.stderr) + + +class KnownException(Exception): + pass + + +def parse_error_category(s, prefix): + if "no expected directives found" in s: + return None + parts = s.split("diagnostics") + diag_category = parts[0] + category_parts = parts[0].strip().strip("'").split("-") + expected = category_parts[0] + if expected != prefix: + raise Exception( + f"expected prefix '{prefix}', but found '{expected}'. Multiple verify prefixes are not supported." 
+ ) + diag_category = category_parts[1] + if "seen but not expected" in parts[1]: + seen = True + elif "expected but not seen" in parts[1]: + seen = False + else: + raise KnownException(f"unexpected category '{parts[1]}'") + return (diag_category, seen) + + +diag_error_re = re.compile(r"File (\S+) Line (\d+): (.+)") +diag_error_re2 = re.compile(r"File \S+ Line \d+ \(directive at (\S+):(\d+)\): (.+)") + + +def parse_diag_error(s): + m = diag_error_re2.match(s) + if not m: + m = diag_error_re.match(s) + if not m: + return None + return (m.group(1), int(m.group(2)), m.group(3)) + + +class Line: + def __init__(self, content, line_n): + self.content = content + self.diag = None + self.line_n = line_n + self.targeting_diags = [] + + def update_line_n(self, n): + self.line_n = n + + def render(self): + if not self.diag: + return self.content + assert "{{DIAG}}" in self.content + res = self.content.replace("{{DIAG}}", self.diag.render()) + if not res.strip(): + return "" + return res + + +class Diag: + def __init__( + self, + prefix, + diag_content, + category, + parsed_target_line_n, + line_is_absolute, + count, + line, + is_re, + whitespace_strings, + is_from_source_file, + ): + self.prefix = prefix + self.diag_content = diag_content + self.category = category + self.parsed_target_line_n = parsed_target_line_n + self.line_is_absolute = line_is_absolute + self.count = count + self.line = line + self.target = None + self.is_re = is_re + self.absolute_target() + self.whitespace_strings = whitespace_strings + self.is_from_source_file = is_from_source_file + + def decrement_count(self): + self.count -= 1 + assert self.count >= 0 + + def increment_count(self): + assert self.count >= 0 + self.count += 1 + + def unset_target(self): + assert self.target is not None + self.target.targeting_diags.remove(self) + self.target = None + + def set_target(self, target): + if self.target: + self.unset_target() + self.target = target + self.target.targeting_diags.append(self) + + def absolute_target(self): + if self.target: + return self.target.line_n + if self.line_is_absolute: + return self.parsed_target_line_n + return self.line.line_n + self.parsed_target_line_n + + def relative_target(self): + return self.absolute_target() - self.line.line_n + + def take(self, other_diag): + assert self.count == 0 + assert other_diag.count > 0 + assert other_diag.target == self.target + assert not other_diag.line_is_absolute + assert not other_diag.is_re and not self.is_re + self.line_is_absolute = False + self.diag_content = other_diag.diag_content + self.count = other_diag.count + self.category = other_diag.category + self.count = other_diag.count + other_diag.count = 0 + + def render(self): + assert self.count >= 0 + if self.count == 0: + return "" + line_location_s = "" + if self.relative_target() != 0: + if self.line_is_absolute: + line_location_s = f"@{self.absolute_target()}" + elif self.relative_target() > 0: + line_location_s = f"@+{self.relative_target()}" + else: + line_location_s = ( + f"@{self.relative_target()}" # the minus sign is implicit + ) + count_s = "" if self.count == 1 else f"{self.count}" + re_s = "-re" if self.is_re else "" + if self.whitespace_strings: + whitespace1_s = self.whitespace_strings[0] + whitespace2_s = self.whitespace_strings[1] + whitespace3_s = self.whitespace_strings[2] + else: + whitespace1_s = " " + whitespace2_s = "" + whitespace3_s = "" + if count_s and not whitespace2_s: + whitespace2_s = " " # required to parse correctly + elif not count_s and whitespace2_s == " ": + """Don't emit 
a weird extra space. + However if the whitespace is something other than the + standard single space, let it be to avoid disrupting manual formatting. + The existence of a non-empty whitespace2_s implies this was parsed with + a count > 1 and then decremented, otherwise this whitespace would have + been parsed as whitespace3_s. + """ + whitespace2_s = "" + return f"//{whitespace1_s}{self.prefix}-{self.category}{re_s}{line_location_s}{whitespace2_s}{count_s}{whitespace3_s}{{{{{self.diag_content}}}}}" + + +expected_diag_re = re.compile( + r"//(\s*)([a-zA-Z]+)-(note|warning|error)(-re)?(@[+-]?\d+)?(?:(\s*)(\d+))?(\s*)\{\{(.*)\}\}" +) + + +def parse_diag(line, filename, lines, prefix): + s = line.content + ms = expected_diag_re.findall(s) + if not ms: + return None + if len(ms) > 1: + raise KnownException( + f"multiple diags on line {filename}:{line.line_n}. Aborting due to missing implementation." + ) + [ + whitespace1_s, + check_prefix, + category_s, + re_s, + target_line_s, + whitespace2_s, + count_s, + whitespace3_s, + diag_s, + ] = ms[0] + if check_prefix != prefix: + return None + if not target_line_s: + target_line_n = 0 + is_absolute = False + elif target_line_s.startswith("@+"): + target_line_n = int(target_line_s[2:]) + is_absolute = False + elif target_line_s.startswith("@-"): + target_line_n = int(target_line_s[1:]) + is_absolute = False + else: + target_line_n = int(target_line_s[1:]) + is_absolute = True + count = int(count_s) if count_s else 1 + line.content = expected_diag_re.sub("{{DIAG}}", s) + + return Diag( + prefix, + diag_s, + category_s, + target_line_n, + is_absolute, + count, + line, + bool(re_s), + [whitespace1_s, whitespace2_s, whitespace3_s], + True, + ) + + +def add_line(new_line, lines): + lines.insert(new_line.line_n - 1, new_line) + for i in range(new_line.line_n, len(lines)): + line = lines[i] + assert line.line_n == i + line.update_line_n(i + 1) + assert all(line.line_n == i + 1 for i, line in enumerate(lines)) + + +def remove_line(old_line, lines): + lines.remove(old_line) + for i in range(old_line.line_n - 1, len(lines)): + line = lines[i] + assert line.line_n == i + 2 + line.update_line_n(i + 1) + assert all(line.line_n == i + 1 for i, line in enumerate(lines)) + + +indent_re = re.compile(r"\s*") + + +def get_indent(s): + return indent_re.match(s).group(0) + + +def orig_line_n_to_new_line_n(line_n, orig_lines): + return orig_lines[line_n - 1].line_n + + +def add_diag(orig_line_n, diag_s, diag_category, lines, orig_lines, prefix): + line_n = orig_line_n_to_new_line_n(orig_line_n, orig_lines) + target = lines[line_n - 1] + for other in target.targeting_diags: + if other.is_re: + raise KnownException( + "mismatching diag on line with regex matcher. 
Skipping due to missing implementation" + ) + reverse = ( + True + if [other for other in target.targeting_diags if other.relative_target() < 0] + else False + ) + + targeting = [ + other for other in target.targeting_diags if not other.line_is_absolute + ] + targeting.sort(reverse=reverse, key=lambda d: d.relative_target()) + prev_offset = 0 + prev_line = target + direction = -1 if reverse else 1 + for d in targeting: + if d.relative_target() != prev_offset + direction: + break + prev_offset = d.relative_target() + prev_line = d.line + total_offset = prev_offset - 1 if reverse else prev_offset + 1 + if reverse: + new_line_n = prev_line.line_n + 1 + else: + new_line_n = prev_line.line_n + assert new_line_n == line_n + (not reverse) - total_offset + + new_line = Line(get_indent(prev_line.content) + "{{DIAG}}\n", new_line_n) + add_line(new_line, lines) + + whitespace_strings = prev_line.diag.whitespace_strings if prev_line.diag else None + new_diag = Diag( + prefix, + diag_s, + diag_category, + total_offset, + False, + 1, + new_line, + False, + whitespace_strings, + False, + ) + new_line.diag = new_diag + new_diag.set_target(target) + + +def remove_dead_diags(lines): + for line in lines: + if not line.diag or line.diag.count != 0: + continue + if line.render() == "": + remove_line(line, lines) + else: + assert line.diag.is_from_source_file + for other_diag in line.targeting_diags: + if ( + other_diag.is_from_source_file + or other_diag.count == 0 + or other_diag.category != line.diag.category + ): + continue + if other_diag.is_re or line.diag.is_re: + continue + line.diag.take(other_diag) + remove_line(other_diag.line, lines) + + +def has_live_diags(lines): + for line in lines: + if line.diag and line.diag.count > 0: + return True + return False + + +def get_expected_no_diags_line_n(lines, prefix): + for line in lines: + if f"{prefix}-no-diagnostics" in line.content: + return line.line_n + return None + + +def update_test_file(filename, diag_errors, prefix, updated_test_files): + dprint(f"updating test file {filename}") + if filename in updated_test_files: + raise KnownException(f"{filename} already updated, but got new output") + else: + updated_test_files.add(filename) + with open(filename, "r") as f: + lines = [Line(line, i + 1) for i, line in enumerate(f.readlines())] + orig_lines = list(lines) + expected_no_diags_line_n = get_expected_no_diags_line_n(orig_lines, prefix) + + for line in lines: + diag = parse_diag(line, filename, lines, prefix) + if diag: + line.diag = diag + diag.set_target(lines[diag.absolute_target() - 1]) + + for line_n, diag_s, diag_category, seen in diag_errors: + if seen: + continue + # this is a diagnostic expected but not seen + assert lines[line_n - 1].diag + if diag_s != lines[line_n - 1].diag.diag_content: + raise KnownException( + f"{filename}:{line_n} - found diag {lines[line_n - 1].diag.diag_content} but expected {diag_s}" + ) + if diag_category != lines[line_n - 1].diag.category: + raise KnownException( + f"{filename}:{line_n} - found {lines[line_n - 1].diag.category} diag but expected {diag_category}" + ) + lines[line_n - 1].diag.decrement_count() + diag_errors_left = [] + diag_errors.sort(reverse=True, key=lambda t: t[0]) + for line_n, diag_s, diag_category, seen in diag_errors: + if not seen: + continue + target = orig_lines[line_n - 1] + other_diags = [ + d + for d in target.targeting_diags + if d.diag_content == diag_s and d.category == diag_category + ] + other_diag = other_diags[0] if other_diags else None + if other_diag: + 
other_diag.increment_count() + else: + add_diag(line_n, diag_s, diag_category, lines, orig_lines, prefix) + remove_dead_diags(lines) + has_diags = has_live_diags(lines) + with open(filename, "w") as f: + if not has_diags and expected_no_diags_line_n is None: + f.write("// expected-no-diagnostics\n") + for line in lines: + if has_diags and line.line_n == expected_no_diags_line_n: + continue + f.write(line.render()) + + +def update_test_files(errors, prefix): + errors_by_file = {} + for (filename, line, diag_s), (diag_category, seen) in errors: + if filename not in errors_by_file: + errors_by_file[filename] = [] + errors_by_file[filename].append((line, diag_s, diag_category, seen)) + updated_test_files = set() + for filename, diag_errors in errors_by_file.items(): + try: + update_test_file(filename, diag_errors, prefix, updated_test_files) + except KnownException as e: + return f"Error in update-verify-tests while updating {filename}: {e}" + updated_files = list(updated_test_files) + assert updated_files + if len(updated_files) == 1: + return f"updated file {updated_files[0]}" + updated_files_s = "\n\t".join(updated_files) + return f"updated files:\n\t{updated_files_s}" + + +def check_expectations(tool_output, prefix): + """ + The entry point function. + Called by the stand-alone update-verify-tests.py as well as litplugin.py. + """ + curr = [] + curr_category = None + try: + for line in tool_output: + if line.startswith("error: "): + curr_category = parse_error_category(line[len("error: ") :], prefix) + continue + + diag_error = parse_diag_error(line.strip()) + if diag_error: + curr.append((diag_error, curr_category)) + else: + dprint("no match") + dprint(line.strip()) + except KnownException as e: + return f"Error in update-verify-tests while parsing tool output: {e}" + if curr: + return update_test_files(curr, prefix) + else: + return "no mismatching diagnostics found" diff --git a/clang/utils/update-verify-tests.py b/clang/utils/update-verify-tests.py new file mode 100644 index 0000000000000..e2874a8c049ef --- /dev/null +++ b/clang/utils/update-verify-tests.py @@ -0,0 +1,38 @@ +import sys +import argparse +from UpdateVerifyTests.core import check_expectations + +DESCRIPTION = """ + Pipe output from clang's -verify into this script to have the test case updated to expect the actual diagnostic output. + When inserting new expected-* checks, it will place them on the line before the location of the diagnostic, with an @+1, + or @+N for some N if there are multiple diagnostics emitted on the same line. If the current checks are using @-N for + this line, the new check will follow that convention also. + Existing checks will be left untouched as much as possible, including their location and whitespace content, to minimize + diffs. If inaccurate, their count will be updated, or the check removed entirely. + + Missing features: + - multiple prefixes on the same line (-verify=my-prefix,my-other-prefix) + - multiple prefixes on separate RUN lines (RUN: -verify=my-prefix\nRUN: -verify my-other-prefix) + - regexes with expected-*-re: existing ones will be left untouched if accurate, but the script will abort if there are any +diagnostic mismatches on the same line.
+ - multiple checks targeting the same line are supported, but a line may only contain one check + - if multiple checks targeting the same line are failing, the script is not guaranteed to produce a minimal diff + +Example usage: + clang -verify [file] | python3 update-verify-tests.py + clang -verify=check [file] | python3 update-verify-tests.py --prefix check +""" + + +def main(): + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.add_argument( + "--prefix", default="expected", help="The prefix passed to -verify" + ) + args = parser.parse_args() + output = check_expectations(sys.stdin.readlines(), args.prefix) + print(output) + + +if __name__ == "__main__": + main() diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index 069defc970190..dbe6094541f63 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -63,6 +63,7 @@ enum ProcessorTypes { INTEL_SIERRAFOREST, INTEL_GRANDRIDGE, INTEL_CLEARWATERFOREST, + AMDFAM1AH, CPU_TYPE_MAX }; @@ -101,6 +102,7 @@ enum ProcessorSubtypes { INTEL_COREI7_ARROWLAKE, INTEL_COREI7_ARROWLAKE_S, INTEL_COREI7_PANTHERLAKE, + AMDFAM1AH_ZNVER5, CPU_SUBTYPE_MAX }; @@ -748,6 +750,24 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, break; // "znver4" } break; // family 19h + case 26: + CPU = "znver5"; + *Type = AMDFAM1AH; + if (Model <= 0x77) { + // Models 00h-0Fh (Breithorn). + // Models 10h-1Fh (Breithorn-Dense). + // Models 20h-2Fh (Strix 1). + // Models 30h-37h (Strix 2). + // Models 38h-3Fh (Strix 3). + // Models 40h-4Fh (Granite Ridge). + // Models 50h-5Fh (Weisshorn). + // Models 60h-6Fh (Krackan1). + // Models 70h-77h (Sarlak). + CPU = "znver5"; + *Subtype = AMDFAM1AH_ZNVER5; + break; // "znver5" + } + break; default: break; // Unknown AMD CPU.
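/* A worked example of the update-verify-tests.py flow described in the
   docstring above (kept in comments here; the file name and diagnostic
   wording are illustrative, not taken from this patch).

   t.c before, a test whose expectations are missing:

     int f(void) { return x; }

   Piping the failing -verify run through the tool,

     not clang -verify t.c 2>&1 | python3 update-verify-tests.py

   inserts a check on the preceding line, pointing at the diagnostic with
   the @+1 convention:

     // expected-error@+1 {{use of undeclared identifier 'x'}}
     int f(void) { return x; }

   Re-running clang -verify t.c then passes, which is exactly what the
   accompanying lit tests assert with their final RUN lines. */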
} diff --git a/libc/src/__support/OSUtil/linux/vdso_sym.h b/libc/src/__support/OSUtil/linux/vdso_sym.h index eb5f204a82f30..968e1536c4d27 100644 --- a/libc/src/__support/OSUtil/linux/vdso_sym.h +++ b/libc/src/__support/OSUtil/linux/vdso_sym.h @@ -44,8 +44,8 @@ template <VDSOSym sym> LIBC_INLINE constexpr auto dispatcher() { else if constexpr (sym == VDSOSym::ClockGetTime64) return static_cast(nullptr); else if constexpr (sym == VDSOSym::GetTimeOfDay) - return static_cast( - nullptr); + return static_cast(nullptr); else if constexpr (sym == VDSOSym::GetCpu) return static_cast( nullptr); diff --git a/libc/src/__support/time/linux/CMakeLists.txt b/libc/src/__support/time/linux/CMakeLists.txt index 1b41c7cb0a98a..f038cb8854b9b 100644 --- a/libc/src/__support/time/linux/CMakeLists.txt +++ b/libc/src/__support/time/linux/CMakeLists.txt @@ -9,6 +9,7 @@ add_header_library( libc.src.__support.common libc.src.__support.error_or libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.vdso ) add_header_library( diff --git a/libc/src/__support/time/linux/clock_gettime.h b/libc/src/__support/time/linux/clock_gettime.h index eca1ba70de592..517cca91391a7 100644 --- a/libc/src/__support/time/linux/clock_gettime.h +++ b/libc/src/__support/time/linux/clock_gettime.h @@ -11,26 +11,47 @@ #include "hdr/types/clockid_t.h" #include "hdr/types/struct_timespec.h" +#include "src/__support/OSUtil/linux/vdso.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" #include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include <sys/syscall.h> +#if defined(SYS_clock_gettime64) +#include <linux/time_types.h> +#endif + namespace LIBC_NAMESPACE_DECL { namespace internal { LIBC_INLINE ErrorOr<int> clock_gettime(clockid_t clockid, timespec *ts) { -#if SYS_clock_gettime - int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_gettime, - static_cast<long>(clockid), - reinterpret_cast<long>(ts)); + using namespace vdso; + int ret; +#if defined(SYS_clock_gettime) + TypedSymbol<VDSOSym::ClockGetTime> clock_gettime; + if (LIBC_LIKELY(clock_gettime != nullptr)) + ret = clock_gettime(clockid, ts); + else + ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_gettime, + static_cast<long>(clockid), + reinterpret_cast<long>(ts)); #elif defined(SYS_clock_gettime64) static_assert( sizeof(time_t) == sizeof(int64_t), "SYS_clock_gettime64 requires struct timespec with 64-bit members."); + + TypedSymbol<VDSOSym::ClockGetTime64> clock_gettime64; + __kernel_timespec ts64{}; + if (LIBC_LIKELY(clock_gettime64 != nullptr)) + ret = clock_gettime64(clockid, &ts64); + else + ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_gettime64, + static_cast<long>(clockid), + reinterpret_cast<long>(&ts64)); + if (ret == 0) { + ts->tv_sec = static_cast<decltype(ts->tv_sec)>(ts64.tv_sec); + ts->tv_nsec = static_cast<decltype(ts->tv_nsec)>(ts64.tv_nsec); + } #else #error "SYS_clock_gettime and SYS_clock_gettime64 syscalls not available."
#endif diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv index b3c26933a9c2a..d449c9d39c3b3 100644 --- a/libcxx/docs/Status/Cxx20Papers.csv +++ b/libcxx/docs/Status/Cxx20Papers.csv @@ -152,7 +152,7 @@ "`P1855R0 `__","Make ````\ freestanding","2019-11 (Belfast)","","","" "`P1862R1 `__","Ranges adaptors for non-copyable iterators","2019-11 (Belfast)","|Complete|","16.0","" "`P1865R1 `__","Add max() to latch and barrier","2019-11 (Belfast)","|Complete|","11.0","" -"`P1869R1 `__","Rename 'condition_variable_any' interruptible wait methods","2019-11 (Belfast)","","","" +"`P1869R1 `__","Rename 'condition_variable_any' interruptible wait methods","2019-11 (Belfast)","|Complete|","18.0","" "`P1870R1 `__","forwarding-range is too subtle","2019-11 (Belfast)","|Complete|","15.0","" "`P1871R1 `__","Concept traits should be named after concepts","2019-11 (Belfast)","|Complete|","14.0","" "`P1872R0 `__","span should have size_type, not index_type","2019-11 (Belfast)","|Complete|","10.0","" diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 0f33885f7df37..ee54fa39fc3d6 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -774,6 +774,10 @@ void StringChunk::writeTo(uint8_t *buf) const { buf[str.size()] = '\0'; } +ImportThunkChunk::ImportThunkChunk(COFFLinkerContext &ctx, Defined *s) + : NonSectionCodeChunk(ImportThunkKind), live(!ctx.config.doGC), + impSymbol(s), ctx(ctx) {} + ImportThunkChunkX64::ImportThunkChunkX64(COFFLinkerContext &ctx, Defined *s) : ImportThunkChunk(ctx, s) { // Intel Optimization Manual says that all branch targets diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index 040a249aabf59..24d7c37de7f3b 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -557,10 +557,13 @@ static const uint8_t importThunkARM64EC[] = { // contents will be a JMP instruction to some __imp_ symbol. class ImportThunkChunk : public NonSectionCodeChunk { public: - ImportThunkChunk(COFFLinkerContext &ctx, Defined *s) - : NonSectionCodeChunk(ImportThunkKind), impSymbol(s), ctx(ctx) {} + ImportThunkChunk(COFFLinkerContext &ctx, Defined *s); static bool classof(const Chunk *c) { return c->kind() == ImportThunkKind; } + // We track the usage of the thunk symbol separately from the import file + // to avoid generating unnecessary thunks. + bool live; + protected: Defined *impSymbol; COFFLinkerContext &ctx; @@ -598,13 +601,17 @@ class ImportThunkChunkARM : public ImportThunkChunk { class ImportThunkChunkARM64 : public ImportThunkChunk { public: - explicit ImportThunkChunkARM64(COFFLinkerContext &ctx, Defined *s) - : ImportThunkChunk(ctx, s) { + explicit ImportThunkChunkARM64(COFFLinkerContext &ctx, Defined *s, + MachineTypes machine) + : ImportThunkChunk(ctx, s), machine(machine) { setAlignment(4); } size_t getSize() const override { return sizeof(importThunkARM64); } void writeTo(uint8_t *buf) const override; - MachineTypes getMachine() const override { return ARM64; } + MachineTypes getMachine() const override { return machine; } + +private: + MachineTypes machine; }; // ARM64EC __impchk_* thunk implementation. 
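The new ImportThunkChunk::live flag above moves thunk liveness from the import file onto the thunk chunk itself, so the __imp_ pointer and its thunk can be garbage-collected independently. A minimal sketch of that shape, with illustrative names rather than the actual lld classes:

    struct ThunkChunk {
      bool live = false; // set by mark-live when the thunk symbol is referenced
    };
    struct ImportEntry {
      bool live = false;           // set when __imp_<name> itself is referenced
      ThunkChunk *thunk = nullptr; // emitted only when thunk->live is set
    };

The writer then walks the import entries and emits a thunk chunk only when its own live bit is set, even if the entry stays live because the import pointer is referenced directly; that is the chunk->live check the Writer.cpp hunk below performs.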
diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 569220468e96a..94ad7f3ceb306 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1002,7 +1002,7 @@ void ObjFile::enqueuePdbFile(StringRef path, ObjFile *fromFile) { } ImportFile::ImportFile(COFFLinkerContext &ctx, MemoryBufferRef m) - : InputFile(ctx, ImportKind, m), live(!ctx.config.doGC), thunkLive(live) {} + : InputFile(ctx, ImportKind, m), live(!ctx.config.doGC) {} MachineTypes ImportFile::getMachineType() const { uint16_t machine = @@ -1018,7 +1018,7 @@ ImportThunkChunk *ImportFile::makeImportThunk() { case I386: return make(ctx, impSym); case ARM64: - return make(ctx, impSym); + return make(ctx, impSym, ARM64); case ARMNT: return make(ctx, impSym); } @@ -1109,7 +1109,14 @@ void ImportFile::parse() { } else { thunkSym = ctx.symtab.addImportThunk( name, impSym, make(ctx, impSym)); - // FIXME: Add aux IAT symbols. + + if (std::optional mangledName = + getArm64ECMangledFunctionName(name)) { + StringRef auxThunkName = saver().save(*mangledName); + auxThunkSym = ctx.symtab.addImportThunk( + auxThunkName, impECSym, + make(ctx, impECSym, ARM64EC)); + } StringRef impChkName = saver().save("__impchk_" + name); impchkThunk = make(this); diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 8140a031f7116..acf221d85ae8f 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -365,17 +365,15 @@ class ImportFile : public InputFile { // Auxiliary IAT symbol and chunk on ARM64EC. DefinedImportData *impECSym = nullptr; Chunk *auxLocation = nullptr; + Symbol *auxThunkSym = nullptr; // We want to eliminate dllimported symbols if no one actually refers to them. // These "Live" bits are used to keep track of which import library members // are actually in use. // // If the Live bit is turned off by MarkLive, Writer will ignore dllimported - // symbols provided by this import library member. We also track whether the - // imported symbol is used separately from whether the thunk is used in order - // to avoid creating unnecessary thunks. + // symbols provided by this import library member. bool live; - bool thunkLive; }; // Used for LTO. 
diff --git a/lld/COFF/MapFile.cpp b/lld/COFF/MapFile.cpp index ed521dd375ed0..52e9ce996f239 100644 --- a/lld/COFF/MapFile.cpp +++ b/lld/COFF/MapFile.cpp @@ -125,7 +125,7 @@ static void getSymbols(const COFFLinkerContext &ctx, if (!file->thunkSym) continue; - if (!file->thunkLive) + if (!file->thunkSym->isLive()) continue; if (auto *thunkSym = dyn_cast(file->thunkSym)) diff --git a/lld/COFF/MarkLive.cpp b/lld/COFF/MarkLive.cpp index 8af58780e1358..3c09baa73a9f7 100644 --- a/lld/COFF/MarkLive.cpp +++ b/lld/COFF/MarkLive.cpp @@ -58,7 +58,7 @@ void markLive(COFFLinkerContext &ctx) { addImportFile(sym->file); } else if (auto *sym = dyn_cast(b)) { addImportFile(sym->wrappedSym->file); - sym->wrappedSym->file->thunkLive = true; + sym->getChunk()->live = true; } }; diff --git a/lld/COFF/PDB.cpp b/lld/COFF/PDB.cpp index c0739b37aeb0f..9b035f53ef49c 100644 --- a/lld/COFF/PDB.cpp +++ b/lld/COFF/PDB.cpp @@ -1527,8 +1527,8 @@ void PDBLinker::addImportFilesToPDB() { if (!file->thunkSym) continue; - if (!file->thunkLive) - continue; + if (!file->thunkSym->isLive()) + continue; std::string dll = StringRef(file->dllName).lower(); llvm::pdb::DbiModuleDescriptorBuilder *&mod = dllToModuleDbi[dll]; diff --git a/lld/COFF/Symbols.cpp b/lld/COFF/Symbols.cpp index 5f4d797f74a2d..567c2b93776c9 100644 --- a/lld/COFF/Symbols.cpp +++ b/lld/COFF/Symbols.cpp @@ -84,7 +84,7 @@ bool Symbol::isLive() const { if (auto *imp = dyn_cast(this)) return imp->file->live; if (auto *imp = dyn_cast(this)) - return imp->wrappedSym->file->thunkLive; + return imp->getChunk()->live; // Assume any other kind of symbol is live. return true; } diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h index 2df60a01ec813..9b21e09bf83a4 100644 --- a/lld/COFF/Symbols.h +++ b/lld/COFF/Symbols.h @@ -395,12 +395,12 @@ class DefinedImportThunk : public Defined { } uint64_t getRVA() { return data->getRVA(); } - Chunk *getChunk() { return data; } + ImportThunkChunk *getChunk() const { return data; } DefinedImportData *wrappedSym; private: - Chunk *data; + ImportThunkChunk *data; }; // If you have a symbol "foo" in your object file, a symbol name diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 9a8040008e73c..216db652c10aa 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1252,14 +1252,22 @@ void Writer::appendImportThunks() { if (!file->live) continue; - if (!file->thunkSym) - continue; + if (file->thunkSym) { + if (!isa(file->thunkSym)) + fatal(toString(ctx, *file->thunkSym) + " was replaced"); + auto *chunk = cast(file->thunkSym)->getChunk(); + if (chunk->live) + textSec->addChunk(chunk); + } + + if (file->auxThunkSym) { + if (!isa(file->auxThunkSym)) + fatal(toString(ctx, *file->auxThunkSym) + " was replaced"); + auto *chunk = cast(file->auxThunkSym)->getChunk(); + if (chunk->live) + textSec->addChunk(chunk); + } - if (!isa(file->thunkSym)) - fatal(toString(ctx, *file->thunkSym) + " was replaced"); - DefinedImportThunk *thunk = cast(file->thunkSym); - if (file->thunkLive) - textSec->addChunk(thunk->getChunk()); if (file->impchkThunk) textSec->addChunk(file->impchkThunk); } diff --git a/lld/test/COFF/arm64ec-import.test b/lld/test/COFF/arm64ec-import.test index f8279cefc3bcf..e403daa41f368 100644 --- a/lld/test/COFF/arm64ec-import.test +++ b/lld/test/COFF/arm64ec-import.test @@ -39,25 +39,31 @@ RUN: llvm-objdump -d out2.dll | FileCheck --check-prefix=DISASM %s DISASM: 180001000: 52800000 mov w0, #0x0 // =0 DISASM-NEXT: 180001004: d65f03c0 ret -DISASM-NEXT: 180001008: d000000b adrp x11, 0x180003000 -DISASM-NEXT: 18000100c: 
f940056b ldr x11, [x11, #0x8] -DISASM-NEXT: 180001010: 9000000a adrp x10, 0x180001000 <.text> -DISASM-NEXT: 180001014: 9101114a add x10, x10, #0x44 -DISASM-NEXT: 180001018: 17fffffa b 0x180001000 <.text> -DISASM-NEXT: 18000101c: d000000b adrp x11, 0x180003000 -DISASM-NEXT: 180001020: f940096b ldr x11, [x11, #0x10] -DISASM-NEXT: 180001024: f0ffffea adrp x10, 0x180000000 -DISASM-NEXT: 180001028: 9100014a add x10, x10, #0x0 -DISASM-NEXT: 18000102c: 17fffff5 b 0x180001000 <.text> -DISASM-NEXT: 180001030: d000000b adrp x11, 0x180003000 -DISASM-NEXT: 180001034: f940116b ldr x11, [x11, #0x20] -DISASM-NEXT: 180001038: 9000000a adrp x10, 0x180001000 <.text> -DISASM-NEXT: 18000103c: 9101314a add x10, x10, #0x4c -DISASM-NEXT: 180001040: 17fffff0 b 0x180001000 <.text> -DISASM-NEXT: 180001044: 52800020 mov w0, #0x1 // =1 -DISASM-NEXT: 180001048: d65f03c0 ret -DISASM-NEXT: 18000104c: 52800040 mov w0, #0x2 // =2 -DISASM-NEXT: 180001050: d65f03c0 ret +DISASM-NEXT: 180001008: 90000030 adrp x16, 0x180005000 +DISASM-NEXT: 18000100c: f9400610 ldr x16, [x16, #0x8] +DISASM-NEXT: 180001010: d61f0200 br x16 +DISASM-NEXT: 180001014: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 180001018: f940056b ldr x11, [x11, #0x8] +DISASM-NEXT: 18000101c: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 180001020: 9101714a add x10, x10, #0x5c +DISASM-NEXT: 180001024: 17fffff7 b 0x180001000 <.text> +DISASM-NEXT: 180001028: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 18000102c: f940096b ldr x11, [x11, #0x10] +DISASM-NEXT: 180001030: f0ffffea adrp x10, 0x180000000 +DISASM-NEXT: 180001034: 9100014a add x10, x10, #0x0 +DISASM-NEXT: 180001038: 17fffff2 b 0x180001000 <.text> +DISASM-NEXT: 18000103c: 90000030 adrp x16, 0x180005000 +DISASM-NEXT: 180001040: f9401210 ldr x16, [x16, #0x20] +DISASM-NEXT: 180001044: d61f0200 br x16 +DISASM-NEXT: 180001048: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 18000104c: f940116b ldr x11, [x11, #0x20] +DISASM-NEXT: 180001050: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 180001054: 9101914a add x10, x10, #0x64 +DISASM-NEXT: 180001058: 17ffffea b 0x180001000 <.text> +DISASM-NEXT: 18000105c: 52800020 mov w0, #0x1 // =1 +DISASM-NEXT: 180001060: d65f03c0 ret +DISASM-NEXT: 180001064: 52800040 mov w0, #0x2 // =2 +DISASM-NEXT: 180001068: d65f03c0 ret DISASM-NEXT: ... 
DISASM-NEXT: 180002000: ff 25 02 10 00 00 jmpq *0x1002(%rip) # 0x180003008 @@ -65,7 +71,8 @@ RUN: llvm-readobj --hex-dump=.test out.dll | FileCheck --check-prefix=TESTSEC %s RUN: llvm-readobj --hex-dump=.test out2.dll | FileCheck --check-prefix=TESTSEC %s TESTSEC: 0x180007000 08500000 00300000 10500000 20500000 TESTSEC-NEXT: 0x180007010 08300000 00500000 10300000 20300000 -TESTSEC-NEXT: 0x180007020 08100000 1c100000 00200000 +TESTSEC-NEXT: 0x180007020 14100000 28100000 00200000 08100000 +TESTSEC-NEXT: 0x180007030 3c100000 RUN: llvm-readobj --headers out.dll | FileCheck -check-prefix=HEADERS %s HEADERS: LoadConfigTableRVA: 0x4010 @@ -76,9 +83,9 @@ RUN: llvm-readobj --coff-load-config out.dll | FileCheck -check-prefix=LOADCONFIG %s LOADCONFIG: AuxiliaryIAT: 0x5000 RUN: llvm-readobj --hex-dump=.rdata out.dll | FileCheck -check-prefix=RDATA %s -RDATA: 0x180005000 00000000 00000000 08100080 01000000 -RDATA-NEXT: 0x180005010 1c100080 01000000 00000000 00000000 -RDATA-NEXT: 0x180005020 30100080 01000000 00000000 00000000 +RDATA: 0x180005000 00000000 00000000 14100080 01000000 +RDATA-NEXT: 0x180005010 28100080 01000000 00000000 00000000 +RDATA-NEXT: 0x180005020 48100080 01000000 00000000 00000000 RUN: llvm-readobj --coff-basereloc out.dll | FileCheck -check-prefix=BASERELOC %s BASERELOC: BaseReloc [ @@ -110,6 +117,8 @@ arm64ec_data_sym: .rva __impchk_func .rva __impchk_func2 .rva func + .rva "#func" + .rva "#t2func" #--- icall.s .text
diff --git a/lldb/docs/resources/sbapi.rst b/lldb/docs/resources/sbapi.rst index cf32cc6c81558..4ca3909e0f291 100644 --- a/lldb/docs/resources/sbapi.rst +++ b/lldb/docs/resources/sbapi.rst @@ -72,6 +72,17 @@ building the LLDB framework for macOS, the headers are processed with ``unifdef`` prior to being copied into the framework bundle to remove macros involving SWIG. +Another good principle when adding SB API methods is: if you find yourself +implementing a significant algorithm in an SB API method, don't. Instead, +look for a method on the underlying lldb_private class that implements it, +adding one there if it doesn't exist, and call that from your SB API method, +as sketched below. If the algorithm is useful, it very likely already +exists, because the lldb_private code needed it too. And if it doesn't exist +yet, someone will probably need it in lldb_private later, and then we end up +with two implementations of the same algorithm. Keeping the SB API code to +just what's needed to manage the SB objects and requests avoids this +situation. + Lifetime -------- Many SB API methods will return strings in the form of ``const char *`` values.
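For illustration, here is a minimal sketch of the layering described above. The method ``GetNumberOfWritableVariables`` and its lldb_private counterpart are invented for this example (they are not existing API); only the shape of the delegation is the point:

.. code-block:: c++

   uint32_t SBFrame::GetNumberOfWritableVariables() {
     LLDB_INSTRUMENT_VA(this);
     // Keep the SB layer to unwrapping the opaque object and delegating.
     // The algorithm that actually walks the variable list belongs on the
     // underlying lldb_private::StackFrame class, where other lldb_private
     // code can find and reuse it.
     if (lldb::StackFrameSP frame_sp = GetFrameSP())
       return frame_sp->GetNumberOfWritableVariables();
     return 0;
   }

If the algorithm later needs caching or locking, that too lands in one place in lldb_private instead of being duplicated behind the SB boundary.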
diff --git a/lldb/examples/synthetic/gnu_libstdcpp.py b/lldb/examples/synthetic/gnu_libstdcpp.py index d98495b8a9df3..a6605a7a7eb5b 100644 --- a/lldb/examples/synthetic/gnu_libstdcpp.py +++ b/lldb/examples/synthetic/gnu_libstdcpp.py @@ -473,11 +473,7 @@ def get_child_at_index(self, index): "[" + str(index) + "]", element_offset, element_type ) bit = element.GetValueAsUnsigned(0) & (1 << bit_offset) - if bit != 0: - value_expr = "(bool)true" - else: - value_expr = "(bool)false" - return self.valobj.CreateValueFromExpression("[%d]" % index, value_expr) + return self.valobj.CreateBoolValue("[%d]" % index, bool(bit)) def update(self): try: diff --git a/lldb/include/lldb/API/SBValue.h b/lldb/include/lldb/API/SBValue.h index bec816fb45184..9090cece80f7c 100644 --- a/lldb/include/lldb/API/SBValue.h +++ b/lldb/include/lldb/API/SBValue.h @@ -145,6 +145,8 @@ class LLDB_API SBValue { // AddressOf() on the return of this call all return invalid lldb::SBValue CreateValueFromData(const char *name, lldb::SBData data, lldb::SBType type); + // Returned value has no address. + lldb::SBValue CreateBoolValue(const char *name, bool value); /// Get a child value by index from a value. /// diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp index 273aac5ad4798..e1a31708d46ff 100644 --- a/lldb/source/API/SBValue.cpp +++ b/lldb/source/API/SBValue.cpp @@ -645,6 +645,22 @@ lldb::SBValue SBValue::CreateValueFromData(const char *name, SBData data, return sb_value; } +lldb::SBValue SBValue::CreateBoolValue(const char *name, bool value) { + LLDB_INSTRUMENT_VA(this, name); + + lldb::SBValue sb_value; + lldb::ValueObjectSP new_value_sp; + ValueLocker locker; + lldb::ValueObjectSP value_sp(GetSP(locker)); + lldb::TargetSP target_sp = m_opaque_sp->GetTargetSP(); + if (value_sp && target_sp) { + new_value_sp = + ValueObject::CreateValueObjectFromBool(target_sp, value, name); + } + sb_value.SetSP(new_value_sp); + return sb_value; +} + SBValue SBValue::GetChildAtIndex(uint32_t idx) { LLDB_INSTRUMENT_VA(this, idx); diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 06da83e26a26a..c36748963db37 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -3768,7 +3768,6 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { SymbolType type = eSymbolTypeInvalid; SectionSP symbol_section; - lldb::addr_t symbol_byte_size = 0; bool add_nlist = true; bool is_gsym = false; bool demangled_is_synthesized = false; @@ -4354,47 +4353,6 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { if (symbol_section) { const addr_t section_file_addr = symbol_section->GetFileAddress(); - if (symbol_byte_size == 0 && function_starts_count > 0) { - addr_t symbol_lookup_file_addr = nlist.n_value; - // Do an exact address match for non-ARM addresses, else get the - // closest since the symbol might be a thumb symbol which has an - // address with bit zero set. - FunctionStarts::Entry *func_start_entry = - function_starts.FindEntry(symbol_lookup_file_addr, !is_arm); - if (is_arm && func_start_entry) { - // Verify that the function start address is the symbol address - // (ARM) or the symbol address + 1 (thumb). - if (func_start_entry->addr != symbol_lookup_file_addr && - func_start_entry->addr != (symbol_lookup_file_addr + 1)) { - // Not the right entry, NULL it out... 
- func_start_entry = nullptr; - } - } - if (func_start_entry) { - func_start_entry->data = true; - - addr_t symbol_file_addr = func_start_entry->addr; - if (is_arm) - symbol_file_addr &= THUMB_ADDRESS_BIT_MASK; - - const FunctionStarts::Entry *next_func_start_entry = - function_starts.FindNextEntry(func_start_entry); - const addr_t section_end_file_addr = - section_file_addr + symbol_section->GetByteSize(); - if (next_func_start_entry) { - addr_t next_symbol_file_addr = next_func_start_entry->addr; - // Be sure the clear the Thumb address bit when we calculate the - // size from the current and next address - if (is_arm) - next_symbol_file_addr &= THUMB_ADDRESS_BIT_MASK; - symbol_byte_size = std::min( - next_symbol_file_addr - symbol_file_addr, - section_end_file_addr - symbol_file_addr); - } else { - symbol_byte_size = section_end_file_addr - symbol_file_addr; - } - } - } symbol_value -= section_file_addr; } @@ -4501,9 +4459,6 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { if (nlist.n_desc & N_WEAK_REF) sym[sym_idx].SetIsWeak(true); - if (symbol_byte_size > 0) - sym[sym_idx].SetByteSize(symbol_byte_size); - if (demangled_is_synthesized) sym[sym_idx].SetDemangledNameIsSynthesized(true); @@ -4622,23 +4577,7 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { Address symbol_addr; if (module_sp->ResolveFileAddress(symbol_file_addr, symbol_addr)) { SectionSP symbol_section(symbol_addr.GetSection()); - uint32_t symbol_byte_size = 0; if (symbol_section) { - const addr_t section_file_addr = symbol_section->GetFileAddress(); - const FunctionStarts::Entry *next_func_start_entry = - function_starts.FindNextEntry(func_start_entry); - const addr_t section_end_file_addr = - section_file_addr + symbol_section->GetByteSize(); - if (next_func_start_entry) { - addr_t next_symbol_file_addr = next_func_start_entry->addr; - if (is_arm) - next_symbol_file_addr &= THUMB_ADDRESS_BIT_MASK; - symbol_byte_size = std::min( - next_symbol_file_addr - symbol_file_addr, - section_end_file_addr - symbol_file_addr); - } else { - symbol_byte_size = section_end_file_addr - symbol_file_addr; - } sym[sym_idx].SetID(synthetic_sym_id++); // Don't set the name for any synthetic symbols, the Symbol // object will generate one if needed when the name is accessed @@ -4650,8 +4589,6 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { add_symbol_addr(symbol_addr.GetFileAddress()); if (symbol_flags) sym[sym_idx].SetFlags(symbol_flags); - if (symbol_byte_size) - sym[sym_idx].SetByteSize(symbol_byte_size); ++sym_idx; } } diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index edc568a6b47e0..ca22dacb2ba6c 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -1218,3 +1218,15 @@ Status MinidumpFileBuilder::DumpFile() { return error; } + +void MinidumpFileBuilder::DeleteFile() noexcept { + Log *log = GetLog(LLDBLog::Object); + + if (m_core_file) { + Status error = m_core_file->Close(); + if (error.Fail()) + LLDB_LOGF(log, "Failed to close minidump file: %s", error.AsCString()); + + m_core_file.reset(); + } +} diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h index 71001e26c00e9..72e5658718b3c 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h @@ -115,6 +115,9 @@ 
class MinidumpFileBuilder { // Run cleanup and write all remaining bytes to file lldb_private::Status DumpFile(); + // Delete the file if it exists + void DeleteFile() noexcept; + private: // Add data to the end of the buffer, if the buffer exceeds the flush level, // trigger a flush. diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp index 5da69dd4f2ce7..be47991bb09fc 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp @@ -55,6 +55,21 @@ size_t ObjectFileMinidump::GetModuleSpecifications( return 0; } +struct DumpFailRemoveHolder { + DumpFailRemoveHolder(MinidumpFileBuilder &builder) : m_builder(builder) {} + + ~DumpFailRemoveHolder() { + if (!m_success) + m_builder.DeleteFile(); + } + + void SetSuccess() { m_success = true; } + +private: + MinidumpFileBuilder &m_builder; + bool m_success = false; +}; + bool ObjectFileMinidump::SaveCore(const lldb::ProcessSP &process_sp, lldb_private::SaveCoreOptions &options, lldb_private::Status &error) { @@ -75,6 +90,7 @@ bool ObjectFileMinidump::SaveCore(const lldb::ProcessSP &process_sp, } MinidumpFileBuilder builder(std::move(maybe_core_file.get()), process_sp, options); + DumpFailRemoveHolder request(builder); Log *log = GetLog(LLDBLog::Object); error = builder.AddHeaderAndCalculateDirectories(); @@ -133,5 +149,7 @@ bool ObjectFileMinidump::SaveCore(const lldb::ProcessSP &process_sp, return false; } + request.SetSuccess(); + return true; }
diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp index 97fff4b9f65a8..80b27571f43d5 100644 --- a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp +++ b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp @@ -319,9 +319,12 @@ void NativeProcessFreeBSD::MonitorSIGTRAP(lldb::pid_t pid) { info.pl_siginfo.si_addr); if (thread) { + auto &regctx = static_cast<NativeRegisterContextFreeBSD &>( + thread->GetRegisterContext()); auto thread_info = m_threads_stepping_with_breakpoint.find(thread->GetID()); - if (thread_info != m_threads_stepping_with_breakpoint.end()) { + if (thread_info != m_threads_stepping_with_breakpoint.end() && + thread_info->second == regctx.GetPC()) { thread->SetStoppedByTrace(); Status brkpt_error = RemoveBreakpoint(thread_info->second); if (brkpt_error.Fail())
diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index 5c262db8db7fd..38b7092682873 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -829,8 +829,11 @@ void NativeProcessLinux::MonitorBreakpoint(NativeThreadLinux &thread) { thread.SetStoppedByBreakpoint(); FixupBreakpointPCAsNeeded(thread); - if (m_threads_stepping_with_breakpoint.find(thread.GetID()) != - m_threads_stepping_with_breakpoint.end()) + NativeRegisterContextLinux &reg_ctx = thread.GetRegisterContext(); + auto stepping_with_bp_it = + m_threads_stepping_with_breakpoint.find(thread.GetID()); + if (stepping_with_bp_it != m_threads_stepping_with_breakpoint.end() && + stepping_with_bp_it->second == reg_ctx.GetPC()) thread.SetStoppedByTrace(); StopRunningThreads(thread.GetID());
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 5eaf9ce2a302a..271ff61a7188a 100644 ---
a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -2317,6 +2317,8 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) { StreamString ostr; ostr.Printf("%" PRIu64, wp_addr); description = std::string(ostr.GetString()); + } else if (key.compare("swbreak") == 0 || key.compare("hwbreak") == 0) { + reason = "breakpoint"; } else if (key.compare("library") == 0) { auto error = LoadModules(); if (error) {
diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index 2cbe20ee10b1a..ccdb6653cf16f 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -493,3 +493,32 @@ def test_save_minidump_custom_save_style_duplicated_regions(self): finally: self.assertTrue(self.dbg.DeleteTarget(target)) + + @skipUnlessPlatform(["linux"]) + def test_minidump_deleted_on_save_failure(self): + """Test that verifies the minidump file is deleted after an error""" + + self.build() + exe = self.getBuildArtifact("a.out") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) + self.assertState(process.GetState(), lldb.eStateStopped) + + custom_file = self.getBuildArtifact("core.should.be.deleted.custom.dmp") + options = lldb.SBSaveCoreOptions() + options.SetOutputFile(lldb.SBFileSpec(custom_file)) + options.SetPluginName("minidump") + options.SetStyle(lldb.eSaveCoreCustomOnly) + # We set custom only and have no thread list and have no memory. + error = process.SaveCore(options) + self.assertTrue(error.Fail()) + self.assertIn( + "no valid address ranges found for core style", error.GetCString() + ) + self.assertTrue(not os.path.isfile(custom_file)) + + finally: + self.assertTrue(self.dbg.DeleteTarget(target))
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index a15af9adfa945..cf52cd1522847 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -207,7 +207,7 @@ on support follow. ``Zkt`` Supported ``Zmmul`` Supported ``Ztso`` Supported - ``Zvbb`` Assembly Support + ``Zvbb`` Supported ``Zvbc`` Assembly Support ``Zve32x`` (`Partially <#riscv-vlen-32-note>`__) Supported ``Zve32f`` (`Partially <#riscv-vlen-32-note>`__) Supported @@ -217,7 +217,7 @@ on support follow. ``Zvfbfmin`` Supported ``Zvfbfwma`` Supported ``Zvfh`` Supported - ``Zvkb`` Assembly Support + ``Zvkb`` Supported ``Zvkg`` Assembly Support ``Zvkn`` Assembly Support ``Zvknc`` Assembly Support
diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h index 5e250641f3b3f..2cc669a966e0b 100644 --- a/llvm/include/llvm/SandboxIR/PassManager.h +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -72,6 +72,7 @@ class PassRegistry { DenseMap<StringRef, Pass *> NameToPassMap; public: + static constexpr const char PassDelimToken = ','; PassRegistry() = default; /// Registers \p PassPtr and takes ownership. Pass &registerPass(std::unique_ptr<Pass> &&PassPtr) { @@ -85,6 +86,9 @@ auto It = NameToPassMap.find(Name); return It != NameToPassMap.end() ? It->second : nullptr; } + /// Creates a pass pipeline and returns the first pass manager.
+ FunctionPassManager &parseAndCreatePassPipeline(StringRef Pipeline); + #ifndef NDEBUG void print(raw_ostream &OS) const { for (const auto &PassPtr : Passes) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 5b57d5cebc334..d21b8a85161e4 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -124,6 +124,7 @@ class ConstantAggregateZero; class ConstantPointerNull; class PoisonValue; class BlockAddress; +class DSOLocalEquivalent; class ConstantTokenNone; class GlobalValue; class Context; @@ -328,6 +329,7 @@ class Value { friend class PoisonValue; // For `Val`. friend class BlockAddress; // For `Val`. friend class GlobalValue; // For `Val`. + friend class DSOLocalEquivalent; // For `Val`. /// All values point to the context. Context &Ctx; @@ -1218,6 +1220,38 @@ class BlockAddress final : public Constant { } }; +class DSOLocalEquivalent final : public Constant { + DSOLocalEquivalent(llvm::DSOLocalEquivalent *C, Context &Ctx) + : Constant(ClassID::DSOLocalEquivalent, C, Ctx) {} + friend class Context; // For constructor. + +public: + /// Return a DSOLocalEquivalent for the specified global value. + static DSOLocalEquivalent *get(GlobalValue *GV); + + GlobalValue *getGlobalValue() const; + + /// For isa/dyn_cast. + static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::DSOLocalEquivalent; + } + + unsigned getUseOperandNo(const Use &Use) const final { + llvm_unreachable("DSOLocalEquivalent has no operands!"); + } + +#ifndef NDEBUG + void verify() const override { + assert(isa(Val) && + "Expected a DSOLocalEquivalent!"); + } + void dumpOS(raw_ostream &OS) const override { + dumpCommonPrefix(OS); + dumpCommonSuffix(OS); + } +#endif +}; + // TODO: This should inherit from ConstantData. class ConstantTokenNone final : public Constant { ConstantTokenNone(llvm::ConstantTokenNone *C, Context &Ctx) diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 7b72f9b7173e6..c218ffee3ce38 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -38,6 +38,7 @@ DEF_CONST(GlobalVariable, GlobalVariable) DEF_CONST(GlobalIFunc, GlobalIFunc) DEF_CONST(GlobalAlias, GlobalAlias) DEF_CONST(BlockAddress, BlockAddress) +DEF_CONST(DSOLocalEquivalent, DSOLocalEquivalent) DEF_CONST(ConstantTokenNone, ConstantTokenNone) #ifndef DEF_INSTR diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index cd160f54e6670..e5bf196559ba6 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -49,11 +49,13 @@ X86_CPU_TYPE(ZHAOXIN_FAM7H, "zhaoxin_fam7h") X86_CPU_TYPE(INTEL_SIERRAFOREST, "sierraforest") X86_CPU_TYPE(INTEL_GRANDRIDGE, "grandridge") X86_CPU_TYPE(INTEL_CLEARWATERFOREST, "clearwaterforest") +X86_CPU_TYPE(AMDFAM1AH, "amdfam1ah") // Alternate names supported by __builtin_cpu_is and target multiversioning. 
X86_CPU_TYPE_ALIAS(INTEL_BONNELL, "atom") X86_CPU_TYPE_ALIAS(AMDFAM10H, "amdfam10") X86_CPU_TYPE_ALIAS(AMDFAM15H, "amdfam15") +X86_CPU_TYPE_ALIAS(AMDFAM1AH, "amdfam1a") X86_CPU_TYPE_ALIAS(INTEL_SILVERMONT, "slm") #undef X86_CPU_TYPE_ALIAS @@ -104,6 +106,7 @@ X86_CPU_SUBTYPE(INTEL_COREI7_GRANITERAPIDS_D,"graniterapids-d") X86_CPU_SUBTYPE(INTEL_COREI7_ARROWLAKE, "arrowlake") X86_CPU_SUBTYPE(INTEL_COREI7_ARROWLAKE_S, "arrowlake-s") X86_CPU_SUBTYPE(INTEL_COREI7_PANTHERLAKE, "pantherlake") +X86_CPU_SUBTYPE(AMDFAM1AH_ZNVER5, "znver5") // Alternate names supported by __builtin_cpu_is and target multiversioning. X86_CPU_SUBTYPE_ALIAS(INTEL_COREI7_ALDERLAKE, "raptorlake") diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.h b/llvm/include/llvm/TargetParser/X86TargetParser.h index 2083e585af4ac..0e17c4674719c 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.h +++ b/llvm/include/llvm/TargetParser/X86TargetParser.h @@ -142,6 +142,7 @@ enum CPUKind { CK_ZNVER2, CK_ZNVER3, CK_ZNVER4, + CK_ZNVER5, CK_x86_64, CK_x86_64_v2, CK_x86_64_v3, diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp index 41815c633fdf2..42e986e6179dd 100644 --- a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -1420,16 +1420,8 @@ void IRSimilarityIdentifier::findCandidates( // IRSimilarityCandidates that include that instruction. for (IRSimilarityCandidate &IRCand : SimilarityCandidates->back()) { for (unsigned Idx = IRCand.getStartIdx(), Edx = IRCand.getEndIdx(); - Idx <= Edx; ++Idx) { - DenseMap>::iterator - IdIt; - IdIt = IndexToIncludedCand.find(Idx); - bool Inserted = false; - if (IdIt == IndexToIncludedCand.end()) - std::tie(IdIt, Inserted) = IndexToIncludedCand.insert( - std::make_pair(Idx, DenseSet())); - IdIt->second.insert(&IRCand); - } + Idx <= Edx; ++Idx) + IndexToIncludedCand[Idx].insert(&IRCand); // Add mapping of candidate to the overall similarity group number. CandToGroup.insert( std::make_pair(&IRCand, SimilarityCandidates->size() - 1)); diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index a69dbbbbdab3c..a73a3aa59403b 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -2231,11 +2231,9 @@ void InstrRefBasedLDV::accumulateFragmentMap(MachineInstr &MI) { // If this is the first sighting of this variable, then we are guaranteed // there are currently no overlapping fragments either. Initialize the set // of seen fragments, record no overlaps for the current one, and return. 
- auto SeenIt = SeenFragments.find(MIVar.getVariable()); - if (SeenIt == SeenFragments.end()) { - SmallSet OneFragment; - OneFragment.insert(ThisFragment); - SeenFragments.insert({MIVar.getVariable(), OneFragment}); + auto [SeenIt, Inserted] = SeenFragments.try_emplace(MIVar.getVariable()); + if (Inserted) { + SeenIt->second.insert(ThisFragment); OverlapFragments.insert({{MIVar.getVariable(), ThisFragment}, {}}); return; diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index fab36f4858e09..8bcc437cbfb86 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -108,9 +108,10 @@ static std::optional isCopyInstr(const MachineInstr &MI, class CopyTracker { struct CopyInfo { - MachineInstr *MI, *LastSeenUseInCopy; + MachineInstr *MI = nullptr; + MachineInstr *LastSeenUseInCopy = nullptr; SmallVector DefRegs; - bool Avail; + bool Avail = false; }; DenseMap Copies; @@ -240,8 +241,7 @@ class CopyTracker { // Remember source that's copied to Def. Once it's clobbered, then // it's no longer available for copy propagation. for (MCRegUnit Unit : TRI.regunits(Src)) { - auto I = Copies.insert({Unit, {nullptr, nullptr, {}, false}}); - auto &Copy = I.first->second; + auto &Copy = Copies[Unit]; if (!is_contained(Copy.DefRegs, Def)) Copy.DefRegs.push_back(Def); Copy.LastSeenUseInCopy = MI; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bb907633e1f82..fe8ae5c9e9af6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15142,26 +15142,42 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // Note: We only run this optimization after type legalization (which often // creates this pattern) and before operation legalization after which // we need to be more careful about the vector instructions that we generate. - if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) { - EVT VecTy = N0.getOperand(0).getValueType(); - EVT ExTy = N0.getValueType(); + if (LegalTypes && !LegalOperations && VT.isScalarInteger() && VT != MVT::i1 && + N0->hasOneUse()) { EVT TrTy = N->getValueType(0); + SDValue Src = N0; + + // Check for cases where we shift down an upper element before truncation. + int EltOffset = 0; + if (Src.getOpcode() == ISD::SRL && Src.getOperand(0)->hasOneUse()) { + if (auto ShAmt = DAG.getValidShiftAmount(Src)) { + if ((*ShAmt % TrTy.getSizeInBits()) == 0) { + Src = Src.getOperand(0); + EltOffset = *ShAmt / TrTy.getSizeInBits(); + } + } + } - auto EltCnt = VecTy.getVectorElementCount(); - unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits(); - auto NewEltCnt = EltCnt * SizeRatio; + if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + EVT VecTy = Src.getOperand(0).getValueType(); + EVT ExTy = Src.getValueType(); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, NewEltCnt); - assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size"); + auto EltCnt = VecTy.getVectorElementCount(); + unsigned SizeRatio = ExTy.getSizeInBits() / TrTy.getSizeInBits(); + auto NewEltCnt = EltCnt * SizeRatio; - SDValue EltNo = N0->getOperand(1); - if (isa(EltNo) && isTypeLegal(NVT)) { - int Elt = EltNo->getAsZExtVal(); - int Index = isLE ? 
(Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1)); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy, - DAG.getBitcast(NVT, N0.getOperand(0)), - DAG.getVectorIdxConstant(Index, DL)); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, NewEltCnt); + assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size"); + + SDValue EltNo = Src->getOperand(1); + if (isa(EltNo) && isTypeLegal(NVT)) { + int Elt = EltNo->getAsZExtVal(); + int Index = isLE ? (Elt * SizeRatio + EltOffset) + : (Elt * SizeRatio + (SizeRatio - 1) - EltOffset); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy, + DAG.getBitcast(NVT, Src.getOperand(0)), + DAG.getVectorIdxConstant(Index, DL)); + } } } diff --git a/llvm/lib/SandboxIR/PassManager.cpp b/llvm/lib/SandboxIR/PassManager.cpp index 2dd19e74734db..4abd39b28e87a 100644 --- a/llvm/lib/SandboxIR/PassManager.cpp +++ b/llvm/lib/SandboxIR/PassManager.cpp @@ -20,6 +20,38 @@ bool FunctionPassManager::runOnFunction(Function &F) { // TODO: Check ChangeAll against hashes before/after. return Change; } + +FunctionPassManager & +PassRegistry::parseAndCreatePassPipeline(StringRef Pipeline) { + static constexpr const char EndToken = '\0'; + // Add EndToken to the end to ease parsing. + std::string PipelineStr = std::string(Pipeline) + EndToken; + int FlagBeginIdx = 0; + // Start with a FunctionPassManager. + auto &InitialPM = static_cast( + registerPass(std::make_unique("init-fpm"))); + + for (auto [Idx, C] : enumerate(PipelineStr)) { + // Keep moving Idx until we find the end of the pass name. + bool FoundDelim = C == EndToken || C == PassDelimToken; + if (!FoundDelim) + continue; + unsigned Sz = Idx - FlagBeginIdx; + std::string PassName(&PipelineStr[FlagBeginIdx], Sz); + FlagBeginIdx = Idx + 1; + + // Get the pass that corresponds to PassName and add it to the pass manager. + auto *Pass = getPassByName(PassName); + if (Pass == nullptr) { + errs() << "Pass '" << PassName << "' not registered!\n"; + exit(1); + } + // TODO: This is safe for now, but would require proper upcasting once we + // add more Pass sub-classes. 
+    InitialPM.addPass(static_cast<FunctionPass *>(Pass)); + } + return InitialPM; +} #ifndef NDEBUG void PassRegistry::dump() const { print(dbgs());
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 8a7c3981e6680..04243564809db 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2535,6 +2535,16 @@ BasicBlock *BlockAddress::getBasicBlock() const { Ctx.getValue(cast<llvm::BlockAddress>(Val)->getBasicBlock())); } +DSOLocalEquivalent *DSOLocalEquivalent::get(GlobalValue *GV) { + auto *LLVMC = llvm::DSOLocalEquivalent::get(cast<llvm::GlobalValue>(GV->Val)); + return cast<DSOLocalEquivalent>(GV->getContext().getValue(LLVMC)); +} + +GlobalValue *DSOLocalEquivalent::getGlobalValue() const { + return cast<GlobalValue>( + Ctx.getValue(cast<llvm::DSOLocalEquivalent>(Val)->getGlobalValue())); +} + ConstantTokenNone *ConstantTokenNone::get(Context &Ctx) { auto *LLVMC = llvm::ConstantTokenNone::get(Ctx.LLVMCtx); return cast<ConstantTokenNone>(Ctx.getOrCreateConstant(LLVMC)); @@ -2669,6 +2679,14 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr<UndefValue>( new UndefValue(cast<llvm::UndefValue>(C), *this)); return It->second.get(); + case llvm::Value::DSOLocalEquivalentVal: { + auto *DSOLE = cast<llvm::DSOLocalEquivalent>(C); + It->second = std::unique_ptr<DSOLocalEquivalent>( + new DSOLocalEquivalent(DSOLE, *this)); + auto *Ret = It->second.get(); + getOrCreateValueInternal(DSOLE->getGlobalValue(), DSOLE); + return Ret; + } case llvm::Value::ConstantArrayVal: It->second = std::unique_ptr<ConstantArray>( new ConstantArray(cast<llvm::ConstantArray>(C), *this));
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index db5cd1d32d73d..3957d21ea695b 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -179,7 +179,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampNumElements(0, v2s32, v4s32) .clampNumElements(0, v2s64, v2s64) .moreElementsToNextPow2(0) - .minScalarSameAs(1, 0); + .minScalarSameAs(1, 0) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); getActionDefinitionsBuilder(G_PTR_ADD) .legalFor({{p0, s64}, {v2p0, v2s64}}) @@ -542,6 +543,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarOrEltToNextPow2(1) .clampScalar(1, s32, s64) .clampScalar(0, s32, s32) + .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) .minScalarEltSameAsIf( [=](const LegalityQuery &Query) { const LLT &Ty = Query.Types[0]; @@ -784,6 +786,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0) .clampScalar(0, s32, s64) .clampScalar(1, s32, s32) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0) .lowerIf(isVector(0));
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp index fc276d1063281..b7ed9de6ca84d 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -448,8 +448,12 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, if (Reg) { // Fixed-length vectors are located in the corresponding scalable-vector // container types. - if (ValVT.isFixedLengthVector()) + if (ValVT.isFixedLengthVector()) { LocVT = TLI.getContainerForFixedLengthVector(LocVT); + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } } else { // For return values, the vector must be passed fully via registers or // via the stack.
@@ -583,8 +587,12 @@ bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, if (MCRegister Reg = allocateRVVReg(ValVT, ValNo, State, TLI)) { // Fixed-length vectors are located in the corresponding scalable-vector // container types. - if (LocVT.isFixedLengthVector()) + if (LocVT.isFixedLengthVector()) { LocVT = TLI.getContainerForFixedLengthVector(LocVT); + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6f2dc710cb3d4..ab49315c12d68 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19090,20 +19090,18 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val, if (VA.needsCustom()) { if (VA.getLocVT().isInteger() && (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) - Val = DAG.getNode(RISCVISD::FMV_H_X, DL, VA.getValVT(), Val); - else if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) - Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val); - else - llvm_unreachable("Unexpected Custom handling."); - return Val; + return DAG.getNode(RISCVISD::FMV_H_X, DL, VA.getValVT(), Val); + if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) + return DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val); + if (VA.getValVT().isFixedLengthVector() && VA.getLocVT().isScalableVector()) + return convertFromScalableVector(VA.getValVT(), Val, DAG, Subtarget); + llvm_unreachable("Unexpected Custom handling."); } switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); case CCValAssign::Full: - if (VA.getValVT().isFixedLengthVector() && VA.getLocVT().isScalableVector()) - Val = convertFromScalableVector(VA.getValVT(), Val, DAG, Subtarget); break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); @@ -19155,20 +19153,18 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, if (VA.needsCustom()) { if (LocVT.isInteger() && (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) - Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, LocVT, Val); - else if (LocVT == MVT::i64 && VA.getValVT() == MVT::f32) - Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val); - else - llvm_unreachable("Unexpected Custom handling."); - return Val; + return DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, LocVT, Val); + if (LocVT == MVT::i64 && VA.getValVT() == MVT::f32) + return DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val); + if (VA.getValVT().isFixedLengthVector() && LocVT.isScalableVector()) + return convertToScalableVector(LocVT, Val, DAG, Subtarget); + llvm_unreachable("Unexpected Custom handling."); } switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); case CCValAssign::Full: - if (VA.getValVT().isFixedLengthVector() && LocVT.isScalableVector()) - Val = convertToScalableVector(LocVT, Val, DAG, Subtarget); break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val); diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 6b4e47a49eb17..735f9dcefb97f 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -434,7 +434,8 @@ class X86AsmParser : public MCTargetAsmParser { class IntelExprStateMachine { 
IntelExprState State = IES_INIT, PrevState = IES_ERROR; - unsigned BaseReg = 0, IndexReg = 0, TmpReg = 0, Scale = 0; + MCRegister BaseReg, IndexReg, TmpReg; + unsigned Scale = 0; int64_t Imm = 0; const MCExpr *Sym = nullptr; StringRef SymName; @@ -468,8 +469,8 @@ class X86AsmParser : public MCTargetAsmParser { bool isBracketUsed() const { return BracketUsed; } bool isOffsetOperator() const { return OffsetOperator; } SMLoc getOffsetLoc() const { return OffsetOperatorLoc; } - unsigned getBaseReg() const { return BaseReg; } - unsigned getIndexReg() const { return IndexReg; } + MCRegister getBaseReg() const { return BaseReg; } + MCRegister getIndexReg() const { return IndexReg; } unsigned getScale() const { return Scale; } const MCExpr *getSym() const { return Sym; } StringRef getSymName() const { return SymName; } @@ -791,7 +792,7 @@ class X86AsmParser : public MCTargetAsmParser { } PrevState = CurrState; } - bool onRegister(unsigned Reg, StringRef &ErrMsg) { + bool onRegister(MCRegister Reg, StringRef &ErrMsg) { IntelExprState CurrState = State; switch (State) { default: @@ -1111,8 +1112,8 @@ class X86AsmParser : public MCTargetAsmParser { std::unique_ptr DefaultMemSIOperand(SMLoc Loc); std::unique_ptr DefaultMemDIOperand(SMLoc Loc); - bool IsSIReg(unsigned Reg); - unsigned GetSIDIForRegClass(unsigned RegClassID, unsigned Reg, bool IsSIReg); + bool IsSIReg(MCRegister Reg); + MCRegister GetSIDIForRegClass(unsigned RegClassID, bool IsSIReg); void AddDefaultSrcDestOperands(OperandVector &Operands, std::unique_ptr &&Src, @@ -1145,14 +1146,14 @@ class X86AsmParser : public MCTargetAsmParser { void tryParseOperandIdx(AsmToken::TokenKind PrevTK, IntelExprStateMachine &SM); - bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc, + bool ParseMemOperand(MCRegister SegReg, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, OperandVector &Operands); X86::CondCode ParseConditionCode(StringRef CCode); bool ParseIntelMemoryOperandSize(unsigned &Size); - bool CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, + bool CreateMemForMSInlineAsm(MCRegister SegReg, const MCExpr *Disp, + MCRegister BaseReg, MCRegister IndexReg, unsigned Scale, bool NonAbsMem, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, const InlineAsmIdentifierInfo &Info, @@ -1300,14 +1301,15 @@ class X86AsmParser : public MCTargetAsmParser { #define GET_SUBTARGET_FEATURE_NAME #include "X86GenAsmMatcher.inc" -static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg, - unsigned Scale, bool Is64BitMode, +static bool CheckBaseRegAndIndexRegAndScale(MCRegister BaseReg, + MCRegister IndexReg, unsigned Scale, + bool Is64BitMode, StringRef &ErrMsg) { // If we have both a base register and an index register make sure they are // both 64-bit or 32-bit registers. // To support VSIB, IndexReg can be 128-bit or 256-bit registers. 
- if (BaseReg != 0 && + if (BaseReg && !(BaseReg == X86::RIP || BaseReg == X86::EIP || X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) || X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) || @@ -1316,7 +1318,7 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg, return true; } - if (IndexReg != 0 && + if (IndexReg && !(IndexReg == X86::EIZ || IndexReg == X86::RIZ || X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) || @@ -1328,9 +1330,9 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg, return true; } - if (((BaseReg == X86::RIP || BaseReg == X86::EIP) && IndexReg != 0) || - IndexReg == X86::EIP || IndexReg == X86::RIP || - IndexReg == X86::ESP || IndexReg == X86::RSP) { + if (((BaseReg == X86::RIP || BaseReg == X86::EIP) && IndexReg) || + IndexReg == X86::EIP || IndexReg == X86::RIP || IndexReg == X86::ESP || + IndexReg == X86::RSP) { ErrMsg = "invalid base+index expression"; return true; } @@ -1344,13 +1346,13 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg, return true; } - if (BaseReg == 0 && + if (!BaseReg && X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) { ErrMsg = "16-bit memory operand may not include only index register"; return true; } - if (BaseReg != 0 && IndexReg != 0) { + if (BaseReg && IndexReg) { if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) && (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) || @@ -1380,8 +1382,7 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg, } // RIP/EIP-relative addressing is only supported in 64-bit mode. - if (!Is64BitMode && BaseReg != 0 && - (BaseReg == X86::RIP || BaseReg == X86::EIP)) { + if (!Is64BitMode && (BaseReg == X86::RIP || BaseReg == X86::EIP)) { ErrMsg = "IP-relative addressing requires 64-bit mode"; return true; } @@ -1608,7 +1609,8 @@ ParseStatus X86AsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, std::unique_ptr X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { bool Parse32 = is32BitMode() || Code16GCC; - unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI); + MCRegister Basereg = + is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI); const MCExpr *Disp = MCConstantExpr::create(0, getContext()); return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1, @@ -1617,15 +1619,16 @@ std::unique_ptr X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { std::unique_ptr X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { bool Parse32 = is32BitMode() || Code16GCC; - unsigned Basereg = is64BitMode() ? X86::RDI : (Parse32 ? X86::EDI : X86::DI); + MCRegister Basereg = + is64BitMode() ? X86::RDI : (Parse32 ? 
X86::EDI : X86::DI); const MCExpr *Disp = MCConstantExpr::create(0, getContext()); return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); } -bool X86AsmParser::IsSIReg(unsigned Reg) { - switch (Reg) { +bool X86AsmParser::IsSIReg(MCRegister Reg) { + switch (Reg.id()) { default: llvm_unreachable("Only (R|E)SI and (R|E)DI are expected!"); case X86::RSI: case X86::ESI: @@ -1638,8 +1641,7 @@ bool X86AsmParser::IsSIReg(unsigned Reg) { } } -unsigned X86AsmParser::GetSIDIForRegClass(unsigned RegClassID, unsigned Reg, - bool IsSIReg) { +MCRegister X86AsmParser::GetSIDIForRegClass(unsigned RegClassID, bool IsSIReg) { switch (RegClassID) { default: llvm_unreachable("Unexpected register class"); case X86::GR64RegClassID: @@ -1690,8 +1692,8 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands, // Return false and let a normal complaint about bogus operands happen return false; - unsigned OrigReg = OrigOp.Mem.BaseReg; - unsigned FinalReg = FinalOp.Mem.BaseReg; + MCRegister OrigReg = OrigOp.Mem.BaseReg; + MCRegister FinalReg = FinalOp.Mem.BaseReg; // If we've already encounterd a register class, make sure all register // bases are of the same register class @@ -1713,7 +1715,7 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands, return false; bool IsSI = IsSIReg(FinalReg); - FinalReg = GetSIDIForRegClass(RegClassID, FinalReg, IsSI); + FinalReg = GetSIDIForRegClass(RegClassID, IsSI); if (FinalReg != OrigReg) { std::string RegName = IsSI ? "ES:(R|E)SI" : "ES:(R|E)DI"; @@ -1753,13 +1755,11 @@ bool X86AsmParser::parseOperand(OperandVector &Operands, StringRef Name) { return parseATTOperand(Operands); } -bool X86AsmParser::CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, bool NonAbsMem, - SMLoc Start, SMLoc End, - unsigned Size, StringRef Identifier, - const InlineAsmIdentifierInfo &Info, - OperandVector &Operands) { +bool X86AsmParser::CreateMemForMSInlineAsm( + MCRegister SegReg, const MCExpr *Disp, MCRegister BaseReg, + MCRegister IndexReg, unsigned Scale, bool NonAbsMem, SMLoc Start, SMLoc End, + unsigned Size, StringRef Identifier, const InlineAsmIdentifierInfo &Info, + OperandVector &Operands) { // If we found a decl other than a VarDecl, then assume it is a FuncDecl or // some other label reference. if (Info.isKind(InlineAsmIdentifierInfo::IK_Label)) { @@ -2651,10 +2651,10 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { } StringRef ErrMsg; - unsigned BaseReg = SM.getBaseReg(); - unsigned IndexReg = SM.getIndexReg(); + MCRegister BaseReg = SM.getBaseReg(); + MCRegister IndexReg = SM.getIndexReg(); if (IndexReg && BaseReg == X86::RIP) - BaseReg = 0; + BaseReg = MCRegister(); unsigned Scale = SM.getScale(); if (!PtrInOperand) Size = SM.getElementSize() << 3; @@ -2703,7 +2703,7 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { // When parsing x64 MS-style assembly, all non-absolute references to a named // variable default to RIP-relative. 
- unsigned DefaultBaseReg = X86::NoRegister; + MCRegister DefaultBaseReg; bool MaybeDirectBranchDest = true; if (Parser.isParsingMasm()) { @@ -2738,7 +2738,7 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { MaybeDirectBranchDest = false; } - if ((BaseReg || IndexReg || RegNo || DefaultBaseReg != X86::NoRegister)) + if ((BaseReg || IndexReg || RegNo || DefaultBaseReg)) Operands.push_back(X86Operand::CreateMem( getPointerWidth(), RegNo, Disp, BaseReg, IndexReg, Scale, Start, End, Size, DefaultBaseReg, /*SymName=*/StringRef(), /*OpDecl=*/nullptr, @@ -2782,7 +2782,7 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) { SMLoc Loc = Parser.getTok().getLoc(), EndLoc; const MCExpr *Expr = nullptr; - unsigned Reg = 0; + MCRegister Reg; if (getLexer().isNot(AsmToken::LParen)) { // No '(' so this is either a displacement expression or a register. if (Parser.parseExpression(Expr, EndLoc)) @@ -2954,7 +2954,7 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) { /// ParseMemOperand: 'seg : disp(basereg, indexreg, scale)'. The '%ds:' prefix /// has already been parsed if present. disp may be provided as well. -bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, +bool X86AsmParser::ParseMemOperand(MCRegister SegReg, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, OperandVector &Operands) { MCAsmParser &Parser = getParser(); @@ -3041,7 +3041,8 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, // If we reached here, then eat the '(' and Process // the rest of the memory operand. - unsigned BaseReg = 0, IndexReg = 0, Scale = 1; + MCRegister BaseReg, IndexReg; + unsigned Scale = 1; SMLoc BaseLoc = getLexer().getLoc(); const MCExpr *E; StringRef ErrMsg; @@ -3888,14 +3889,14 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { uint64_t TSFlags = MII.get(Opcode).TSFlags; if (isVFCMADDCPH(Opcode) || isVFCMADDCSH(Opcode) || isVFMADDCPH(Opcode) || isVFMADDCSH(Opcode)) { - unsigned Dest = Inst.getOperand(0).getReg(); + MCRegister Dest = Inst.getOperand(0).getReg(); for (unsigned i = 2; i < Inst.getNumOperands(); i++) if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) return Warning(Ops[0]->getStartLoc(), "Destination register should be " "distinct from source registers"); } else if (isVFCMULCPH(Opcode) || isVFCMULCSH(Opcode) || isVFMULCPH(Opcode) || isVFMULCSH(Opcode)) { - unsigned Dest = Inst.getOperand(0).getReg(); + MCRegister Dest = Inst.getOperand(0).getReg(); // The mask variants have different operand list. Scan from the third // operand to avoid emitting incorrect warning. 
// VFMULCPHZrr Dest, Src1, Src2 @@ -3909,8 +3910,9 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { } else if (isV4FMADDPS(Opcode) || isV4FMADDSS(Opcode) || isV4FNMADDPS(Opcode) || isV4FNMADDSS(Opcode) || isVP4DPWSSDS(Opcode) || isVP4DPWSSD(Opcode)) { - unsigned Src2 = Inst.getOperand(Inst.getNumOperands() - - X86::AddrNumOperands - 1).getReg(); + MCRegister Src2 = + Inst.getOperand(Inst.getNumOperands() - X86::AddrNumOperands - 1) + .getReg(); unsigned Src2Enc = MRI->getEncodingValue(Src2); if (Src2Enc % 4 != 0) { StringRef RegName = X86IntelInstPrinter::getRegisterName(Src2); @@ -3946,9 +3948,9 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { } else if (isTCMMIMFP16PS(Opcode) || isTCMMRLFP16PS(Opcode) || isTDPBF16PS(Opcode) || isTDPFP16PS(Opcode) || isTDPBSSD(Opcode) || isTDPBSUD(Opcode) || isTDPBUSD(Opcode) || isTDPBUUD(Opcode)) { - unsigned SrcDest = Inst.getOperand(0).getReg(); - unsigned Src1 = Inst.getOperand(2).getReg(); - unsigned Src2 = Inst.getOperand(3).getReg(); + MCRegister SrcDest = Inst.getOperand(0).getReg(); + MCRegister Src1 = Inst.getOperand(2).getReg(); + MCRegister Src2 = Inst.getOperand(3).getReg(); if (SrcDest == Src1 || SrcDest == Src2 || Src1 == Src2) return Error(Ops[0]->getStartLoc(), "all tmm registers must be distinct"); } @@ -3956,14 +3958,14 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { // Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to // check this with the legacy encoding, VEX/EVEX/XOP don't use REX. if ((TSFlags & X86II::EncodingMask) == 0) { - MCPhysReg HReg = X86::NoRegister; + MCRegister HReg; bool UsesRex = TSFlags & X86II::REX_W; unsigned NumOps = Inst.getNumOperands(); for (unsigned i = 0; i != NumOps; ++i) { const MCOperand &MO = Inst.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + MCRegister Reg = MO.getReg(); if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) HReg = Reg; if (X86II::isX86_64NonExtLowByteReg(Reg) || @@ -3971,7 +3973,7 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { UsesRex = true; } - if (UsesRex && HReg != X86::NoRegister) { + if (UsesRex && HReg) { StringRef RegName = X86IntelInstPrinter::getRegisterName(HReg); return Error(Ops[0]->getStartLoc(), "can't encode '" + RegName + "' in an instruction requiring " @@ -4022,7 +4024,7 @@ void X86AsmParser::applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out) { case X86::RETI64: { MCInst ShlInst, FenceInst; bool Parse32 = is32BitMode() || Code16GCC; - unsigned Basereg = + MCRegister Basereg = is64BitMode() ? X86::RSP : (Parse32 ? 
X86::ESP : X86::SP); const MCExpr *Disp = MCConstantExpr::create(0, getContext()); auto ShlMemOp = X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 988966fa6a6c4..6cf37836f921d 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1549,6 +1549,19 @@ def ProcessorFeatures { FeatureVPOPCNTDQ]; list ZN4Features = !listconcat(ZN3Features, ZN4AdditionalFeatures); + + + list ZN5Tuning = ZN4Tuning; + list ZN5AdditionalFeatures = [FeatureVNNI, + FeatureMOVDIRI, + FeatureMOVDIR64B, + FeatureVP2INTERSECT, + FeaturePREFETCHI, + FeatureAVXVNNI + ]; + list ZN5Features = + !listconcat(ZN4Features, ZN5AdditionalFeatures); + } //===----------------------------------------------------------------------===// @@ -1898,6 +1911,8 @@ def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features, ProcessorFeatures.ZN3Tuning>; def : ProcModel<"znver4", Znver4Model, ProcessorFeatures.ZN4Features, ProcessorFeatures.ZN4Tuning>; +def : ProcModel<"znver5", Znver4Model, ProcessorFeatures.ZN5Features, + ProcessorFeatures.ZN5Tuning>; def : Proc<"geode", [FeatureX87, FeatureCX8, FeatureMMX, FeaturePRFCHW], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3c5b952ff62e2..3597b864705ef 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9927,11 +9927,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND -static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, - const APInt &Zeroable, - ArrayRef Mask, SDValue &V1, - SDValue &V2, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) @@ -15966,8 +15966,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) - if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2, - DAG, Subtarget)) + if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; // If we have AVX2 then we always want to lower with a blend because an v4 we @@ -16046,8 +16046,8 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return Rotate; - if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2, - DAG, Subtarget)) + if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; } @@ -16184,8 +16184,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef Mask, // If we have VLX support, we can use VEXPAND. 
if (Subtarget.hasVLX()) - if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2, - DAG, Subtarget)) + if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; // Try to match an interleave of two v8f32s and lower them as unpck and @@ -16308,8 +16308,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return Rotate; - if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2, - DAG, Subtarget)) + if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; } @@ -16827,8 +16827,8 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return Op; - if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, - DAG, Subtarget)) + if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, @@ -16898,8 +16898,8 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef Mask, } // If we have AVX512F support, we can use VEXPAND. - if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG); @@ -16967,8 +16967,8 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef Mask, return Unpck; // If we have AVX512F support, we can use VEXPAND. - if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, - DAG, Subtarget)) + if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, @@ -17064,8 +17064,8 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef Mask, return V; // If we have AVX512F support, we can use VEXPAND. - if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, - DAG, Subtarget)) + if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, diff --git a/llvm/lib/Target/X86/X86PfmCounters.td b/llvm/lib/Target/X86/X86PfmCounters.td index 2b1dac411c992..c30e989cdc2af 100644 --- a/llvm/lib/Target/X86/X86PfmCounters.td +++ b/llvm/lib/Target/X86/X86PfmCounters.td @@ -350,3 +350,4 @@ def ZnVer4PfmCounters : ProcPfmCounters { let ValidationCounters = DefaultAMDPfmValidationCounters; } def : PfmCountersBinding<"znver4", ZnVer4PfmCounters>; +def : PfmCountersBinding<"znver5", ZnVer4PfmCounters>; diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 986b9a211ce6c..b2c4f9ee00293 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1151,6 +1151,25 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, break; // "znver4" } break; // family 19h + case 26: + CPU = "znver5"; + *Type = X86::AMDFAM1AH; + if (Model <= 0x77) { + // Models 00h-0Fh (Breithorn). + // Models 10h-1Fh (Breithorn-Dense). + // Models 20h-2Fh (Strix 1). + // Models 30h-37h (Strix 2). + // Models 38h-3Fh (Strix 3). + // Models 40h-4Fh (Granite Ridge). + // Models 50h-5Fh (Weisshorn). + // Models 60h-6Fh (Krackan1). + // Models 70h-77h (Sarlak). 
+ CPU = "znver5"; + *Subtype = X86::AMDFAM1AH_ZNVER5; + break; // "znver5" + } + break; + default: break; // Unknown AMD CPU. } diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 57bda0651ea82..09d4312918acf 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -238,6 +238,10 @@ static constexpr FeatureBitset FeaturesZNVER4 = FeatureAVX512BITALG | FeatureAVX512VPOPCNTDQ | FeatureAVX512BF16 | FeatureGFNI | FeatureSHSTK; +static constexpr FeatureBitset FeaturesZNVER5 = + FeaturesZNVER4 | FeatureAVXVNNI | FeatureMOVDIRI | FeatureMOVDIR64B | + FeatureAVX512VP2INTERSECT | FeaturePREFETCHI | FeatureAVXVNNI; + // D151696 tranplanted Mangling and OnlyForCPUDispatchSpecific from // X86TargetParser.def to here. They are assigned by following ways: // 1. Copy the mangling from the original CPU_SPEICIFC MACROs. If no, assign @@ -417,6 +421,7 @@ constexpr ProcInfo Processors[] = { { {"znver2"}, CK_ZNVER2, FEATURE_AVX2, FeaturesZNVER2, '\0', false }, { {"znver3"}, CK_ZNVER3, FEATURE_AVX2, FeaturesZNVER3, '\0', false }, { {"znver4"}, CK_ZNVER4, FEATURE_AVX512VBMI2, FeaturesZNVER4, '\0', false }, + { {"znver5"}, CK_ZNVER5, FEATURE_AVX512VP2INTERSECT, FeaturesZNVER5, '\0', false }, // Generic 64-bit processor. { {"x86-64"}, CK_x86_64, FEATURE_SSE2 , FeaturesX86_64, '\0', false }, { {"x86-64-v2"}, CK_x86_64_v2, FEATURE_SSE4_2 , FeaturesX86_64_V2, '\0', false }, diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 13b6680264c87..5f8efd1a8f32e 100644 --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -8,6 +8,7 @@ #include "llvm/Transforms/Coroutines/CoroEarly.h" #include "CoroInternal.h" +#include "CoroShape.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 891798f53b2d0..fcbd31878bdea 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -12,6 +12,7 @@ #define LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H #include "CoroInstr.h" +#include "CoroShape.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" @@ -58,229 +59,6 @@ struct LowererBase { CallInst *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt); }; -enum class ABI { - /// The "resume-switch" lowering, where there are separate resume and - /// destroy functions that are shared between all suspend points. The - /// coroutine frame implicitly stores the resume and destroy functions, - /// the current index, and any promise value. - Switch, - - /// The "returned-continuation" lowering, where each suspend point creates a - /// single continuation function that is used for both resuming and - /// destroying. Does not support promises. - Retcon, - - /// The "unique returned-continuation" lowering, where each suspend point - /// creates a single continuation function that is used for both resuming - /// and destroying. Does not support promises. The function is known to - /// suspend at most once during its execution, and the return value of - /// the continuation is void. - RetconOnce, - - /// The "async continuation" lowering, where each suspend point creates a - /// single continuation function. The continuation function is available as an - /// intrinsic. 
- Async, -}; - -// Holds structural Coroutine Intrinsics for a particular function and other -// values used during CoroSplit pass. -struct LLVM_LIBRARY_VISIBILITY Shape { - CoroBeginInst *CoroBegin; - SmallVector CoroEnds; - SmallVector CoroSizes; - SmallVector CoroAligns; - SmallVector CoroSuspends; - SmallVector SwiftErrorOps; - SmallVector CoroAwaitSuspends; - SmallVector SymmetricTransfers; - - // Field indexes for special fields in the switch lowering. - struct SwitchFieldIndex { - enum { - Resume, - Destroy - - // The promise field is always at a fixed offset from the start of - // frame given its type, but the index isn't a constant for all - // possible frames. - - // The switch-index field isn't at a fixed offset or index, either; - // we just work it in where it fits best. - }; - }; - - coro::ABI ABI; - - StructType *FrameTy; - Align FrameAlign; - uint64_t FrameSize; - Value *FramePtr; - BasicBlock *AllocaSpillBlock; - - /// This would only be true if optimization are enabled. - bool OptimizeFrame; - - struct SwitchLoweringStorage { - SwitchInst *ResumeSwitch; - AllocaInst *PromiseAlloca; - BasicBlock *ResumeEntryBlock; - unsigned IndexField; - unsigned IndexAlign; - unsigned IndexOffset; - bool HasFinalSuspend; - bool HasUnwindCoroEnd; - }; - - struct RetconLoweringStorage { - Function *ResumePrototype; - Function *Alloc; - Function *Dealloc; - BasicBlock *ReturnBlock; - bool IsFrameInlineInStorage; - }; - - struct AsyncLoweringStorage { - Value *Context; - CallingConv::ID AsyncCC; - unsigned ContextArgNo; - uint64_t ContextHeaderSize; - uint64_t ContextAlignment; - uint64_t FrameOffset; // Start of the frame. - uint64_t ContextSize; // Includes frame size. - GlobalVariable *AsyncFuncPointer; - - Align getContextAlignment() const { return Align(ContextAlignment); } - }; - - union { - SwitchLoweringStorage SwitchLowering; - RetconLoweringStorage RetconLowering; - AsyncLoweringStorage AsyncLowering; - }; - - CoroIdInst *getSwitchCoroId() const { - assert(ABI == coro::ABI::Switch); - return cast(CoroBegin->getId()); - } - - AnyCoroIdRetconInst *getRetconCoroId() const { - assert(ABI == coro::ABI::Retcon || - ABI == coro::ABI::RetconOnce); - return cast(CoroBegin->getId()); - } - - CoroIdAsyncInst *getAsyncCoroId() const { - assert(ABI == coro::ABI::Async); - return cast(CoroBegin->getId()); - } - - unsigned getSwitchIndexField() const { - assert(ABI == coro::ABI::Switch); - assert(FrameTy && "frame type not assigned"); - return SwitchLowering.IndexField; - } - IntegerType *getIndexType() const { - assert(ABI == coro::ABI::Switch); - assert(FrameTy && "frame type not assigned"); - return cast(FrameTy->getElementType(getSwitchIndexField())); - } - ConstantInt *getIndex(uint64_t Value) const { - return ConstantInt::get(getIndexType(), Value); - } - - PointerType *getSwitchResumePointerType() const { - assert(ABI == coro::ABI::Switch); - assert(FrameTy && "frame type not assigned"); - return cast(FrameTy->getElementType(SwitchFieldIndex::Resume)); - } - - FunctionType *getResumeFunctionType() const { - switch (ABI) { - case coro::ABI::Switch: - return FunctionType::get(Type::getVoidTy(FrameTy->getContext()), - PointerType::getUnqual(FrameTy->getContext()), - /*IsVarArg=*/false); - case coro::ABI::Retcon: - case coro::ABI::RetconOnce: - return RetconLowering.ResumePrototype->getFunctionType(); - case coro::ABI::Async: - // Not used. The function type depends on the active suspend. 
- return nullptr; - } - - llvm_unreachable("Unknown coro::ABI enum"); - } - - ArrayRef getRetconResultTypes() const { - assert(ABI == coro::ABI::Retcon || - ABI == coro::ABI::RetconOnce); - auto FTy = CoroBegin->getFunction()->getFunctionType(); - - // The safety of all this is checked by checkWFRetconPrototype. - if (auto STy = dyn_cast(FTy->getReturnType())) { - return STy->elements().slice(1); - } else { - return ArrayRef(); - } - } - - ArrayRef getRetconResumeTypes() const { - assert(ABI == coro::ABI::Retcon || - ABI == coro::ABI::RetconOnce); - - // The safety of all this is checked by checkWFRetconPrototype. - auto FTy = RetconLowering.ResumePrototype->getFunctionType(); - return FTy->params().slice(1); - } - - CallingConv::ID getResumeFunctionCC() const { - switch (ABI) { - case coro::ABI::Switch: - return CallingConv::Fast; - - case coro::ABI::Retcon: - case coro::ABI::RetconOnce: - return RetconLowering.ResumePrototype->getCallingConv(); - case coro::ABI::Async: - return AsyncLowering.AsyncCC; - } - llvm_unreachable("Unknown coro::ABI enum"); - } - - AllocaInst *getPromiseAlloca() const { - if (ABI == coro::ABI::Switch) - return SwitchLowering.PromiseAlloca; - return nullptr; - } - - BasicBlock::iterator getInsertPtAfterFramePtr() const { - if (auto *I = dyn_cast(FramePtr)) { - BasicBlock::iterator It = std::next(I->getIterator()); - It.setHeadBit(true); // Copy pre-RemoveDIs behaviour. - return It; - } - return cast(FramePtr)->getParent()->getEntryBlock().begin(); - } - - /// Allocate memory according to the rules of the active lowering. - /// - /// \param CG - if non-null, will be updated for the new call - Value *emitAlloc(IRBuilder<> &Builder, Value *Size, CallGraph *CG) const; - - /// Deallocate memory according to the rules of the active lowering. - /// - /// \param CG - if non-null, will be updated for the new call - void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const; - - Shape() = default; - explicit Shape(Function &F, bool OptimizeFrame = false) - : OptimizeFrame(OptimizeFrame) { - buildFrom(F); - } - void buildFrom(Function &F); -}; - bool defaultMaterializable(Instruction &V); void normalizeCoroutine(Function &F, coro::Shape &Shape, TargetTransformInfo &TTI); diff --git a/llvm/lib/Transforms/Coroutines/CoroShape.h b/llvm/lib/Transforms/Coroutines/CoroShape.h new file mode 100644 index 0000000000000..3d1b38082173d --- /dev/null +++ b/llvm/lib/Transforms/Coroutines/CoroShape.h @@ -0,0 +1,288 @@ +//===- CoroShape.h - Coroutine info for lowering --------------*- C++ -*---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file declares the shape info struct that is required by many coroutine +// utility methods. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_COROUTINES_COROSHAPE_H +#define LLVM_TRANSFORMS_COROUTINES_COROSHAPE_H + +#include "CoroInstr.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class CallGraph; + +namespace coro { + +enum class ABI { + /// The "resume-switch" lowering, where there are separate resume and + /// destroy functions that are shared between all suspend points. 
+  /// coroutine frame implicitly stores the resume and destroy functions,
+  /// the current index, and any promise value.
+  Switch,
+
+  /// The "returned-continuation" lowering, where each suspend point creates a
+  /// single continuation function that is used for both resuming and
+  /// destroying. Does not support promises.
+  Retcon,
+
+  /// The "unique returned-continuation" lowering, where each suspend point
+  /// creates a single continuation function that is used for both resuming
+  /// and destroying. Does not support promises. The function is known to
+  /// suspend at most once during its execution, and the return value of
+  /// the continuation is void.
+  RetconOnce,
+
+  /// The "async continuation" lowering, where each suspend point creates a
+  /// single continuation function. The continuation function is available as an
+  /// intrinsic.
+  Async,
+};
+
+// Holds structural Coroutine Intrinsics for a particular function and other
+// values used during CoroSplit pass.
+struct LLVM_LIBRARY_VISIBILITY Shape {
+  CoroBeginInst *CoroBegin = nullptr;
+  SmallVector<AnyCoroEndInst *, 4> CoroEnds;
+  SmallVector<CoroSizeInst *, 2> CoroSizes;
+  SmallVector<CoroAlignInst *, 2> CoroAligns;
+  SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends;
+  SmallVector<CoroAwaitSuspendInst *, 4> CoroAwaitSuspends;
+  SmallVector<CallInst *> SymmetricTransfers;
+
+  // Values invalidated by invalidateCoroutine() and tidyCoroutine()
+  SmallVector<CoroFrameInst *, 8> CoroFrames;
+  SmallVector<CoroSaveInst *, 2> UnusedCoroSaves;
+
+  // Values invalidated by replaceSwiftErrorOps()
+  SmallVector<CallInst *, 2> SwiftErrorOps;
+
+  void clear() {
+    CoroBegin = nullptr;
+    CoroEnds.clear();
+    CoroSizes.clear();
+    CoroAligns.clear();
+    CoroSuspends.clear();
+    CoroAwaitSuspends.clear();
+    SymmetricTransfers.clear();
+
+    CoroFrames.clear();
+    UnusedCoroSaves.clear();
+
+    SwiftErrorOps.clear();
+
+    FrameTy = nullptr;
+    FramePtr = nullptr;
+    AllocaSpillBlock = nullptr;
+  }
+
+  // Scan the function and collect the above intrinsics for later processing
+  void analyze(Function &F);
+  // If, for some reason, we were not able to find coro.begin, bail out.
+  void invalidateCoroutine(Function &F);
+  // Perform ABI-related initial transformation
+  void initABI();
+  // Remove orphaned and unnecessary intrinsics
+  void tidyCoroutine();
+
+  // Field indexes for special fields in the switch lowering.
+  struct SwitchFieldIndex {
+    enum {
+      Resume,
+      Destroy
+
+      // The promise field is always at a fixed offset from the start of
+      // frame given its type, but the index isn't a constant for all
+      // possible frames.
+
+      // The switch-index field isn't at a fixed offset or index, either;
+      // we just work it in where it fits best.
+    };
+  };
+
+  coro::ABI ABI;
+
+  StructType *FrameTy = nullptr;
+  Align FrameAlign;
+  uint64_t FrameSize = 0;
+  Value *FramePtr = nullptr;
+  BasicBlock *AllocaSpillBlock = nullptr;
+
+  /// This would only be true if optimizations are enabled.
+  bool OptimizeFrame;
+
+  struct SwitchLoweringStorage {
+    SwitchInst *ResumeSwitch;
+    AllocaInst *PromiseAlloca;
+    BasicBlock *ResumeEntryBlock;
+    unsigned IndexField;
+    unsigned IndexAlign;
+    unsigned IndexOffset;
+    bool HasFinalSuspend;
+    bool HasUnwindCoroEnd;
+  };
+
+  struct RetconLoweringStorage {
+    Function *ResumePrototype;
+    Function *Alloc;
+    Function *Dealloc;
+    BasicBlock *ReturnBlock;
+    bool IsFrameInlineInStorage;
+  };
+
+  struct AsyncLoweringStorage {
+    Value *Context;
+    CallingConv::ID AsyncCC;
+    unsigned ContextArgNo;
+    uint64_t ContextHeaderSize;
+    uint64_t ContextAlignment;
+    uint64_t FrameOffset; // Start of the frame.
+    uint64_t ContextSize; // Includes frame size.
+    GlobalVariable *AsyncFuncPointer;
+
+    Align getContextAlignment() const { return Align(ContextAlignment); }
+  };
+
+  union {
+    SwitchLoweringStorage SwitchLowering;
+    RetconLoweringStorage RetconLowering;
+    AsyncLoweringStorage AsyncLowering;
+  };
+
+  CoroIdInst *getSwitchCoroId() const {
+    assert(ABI == coro::ABI::Switch);
+    return cast<CoroIdInst>(CoroBegin->getId());
+  }
+
+  AnyCoroIdRetconInst *getRetconCoroId() const {
+    assert(ABI == coro::ABI::Retcon || ABI == coro::ABI::RetconOnce);
+    return cast<AnyCoroIdRetconInst>(CoroBegin->getId());
+  }
+
+  CoroIdAsyncInst *getAsyncCoroId() const {
+    assert(ABI == coro::ABI::Async);
+    return cast<CoroIdAsyncInst>(CoroBegin->getId());
+  }
+
+  unsigned getSwitchIndexField() const {
+    assert(ABI == coro::ABI::Switch);
+    assert(FrameTy && "frame type not assigned");
+    return SwitchLowering.IndexField;
+  }
+  IntegerType *getIndexType() const {
+    assert(ABI == coro::ABI::Switch);
+    assert(FrameTy && "frame type not assigned");
+    return cast<IntegerType>(FrameTy->getElementType(getSwitchIndexField()));
+  }
+  ConstantInt *getIndex(uint64_t Value) const {
+    return ConstantInt::get(getIndexType(), Value);
+  }
+
+  PointerType *getSwitchResumePointerType() const {
+    assert(ABI == coro::ABI::Switch);
+    assert(FrameTy && "frame type not assigned");
+    return cast<PointerType>(FrameTy->getElementType(SwitchFieldIndex::Resume));
+  }
+
+  FunctionType *getResumeFunctionType() const {
+    switch (ABI) {
+    case coro::ABI::Switch:
+      return FunctionType::get(Type::getVoidTy(FrameTy->getContext()),
+                               PointerType::getUnqual(FrameTy->getContext()),
+                               /*IsVarArg=*/false);
+    case coro::ABI::Retcon:
+    case coro::ABI::RetconOnce:
+      return RetconLowering.ResumePrototype->getFunctionType();
+    case coro::ABI::Async:
+      // Not used. The function type depends on the active suspend.
+      return nullptr;
+    }
+
+    llvm_unreachable("Unknown coro::ABI enum");
+  }
+
+  ArrayRef<Type *> getRetconResultTypes() const {
+    assert(ABI == coro::ABI::Retcon || ABI == coro::ABI::RetconOnce);
+    auto FTy = CoroBegin->getFunction()->getFunctionType();
+
+    // The safety of all this is checked by checkWFRetconPrototype.
+    if (auto STy = dyn_cast<StructType>(FTy->getReturnType())) {
+      return STy->elements().slice(1);
+    } else {
+      return ArrayRef<Type *>();
+    }
+  }
+
+  ArrayRef<Type *> getRetconResumeTypes() const {
+    assert(ABI == coro::ABI::Retcon || ABI == coro::ABI::RetconOnce);
+
+    // The safety of all this is checked by checkWFRetconPrototype.
+    auto FTy = RetconLowering.ResumePrototype->getFunctionType();
+    return FTy->params().slice(1);
+  }
+
+  CallingConv::ID getResumeFunctionCC() const {
+    switch (ABI) {
+    case coro::ABI::Switch:
+      return CallingConv::Fast;
+
+    case coro::ABI::Retcon:
+    case coro::ABI::RetconOnce:
+      return RetconLowering.ResumePrototype->getCallingConv();
+    case coro::ABI::Async:
+      return AsyncLowering.AsyncCC;
+    }
+    llvm_unreachable("Unknown coro::ABI enum");
+  }
+
+  AllocaInst *getPromiseAlloca() const {
+    if (ABI == coro::ABI::Switch)
+      return SwitchLowering.PromiseAlloca;
+    return nullptr;
+  }
+
+  BasicBlock::iterator getInsertPtAfterFramePtr() const {
+    if (auto *I = dyn_cast<Instruction>(FramePtr)) {
+      BasicBlock::iterator It = std::next(I->getIterator());
+      It.setHeadBit(true); // Copy pre-RemoveDIs behaviour.
+      return It;
+    }
+    return cast<Argument>(FramePtr)->getParent()->getEntryBlock().begin();
+  }
+
+  /// Allocate memory according to the rules of the active lowering.
+  ///
+  /// \param CG - if non-null, will be updated for the new call
+  Value *emitAlloc(IRBuilder<> &Builder, Value *Size, CallGraph *CG) const;
+
+  /// Deallocate memory according to the rules of the active lowering.
+ /// + /// \param CG - if non-null, will be updated for the new call + void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const; + + Shape() = default; + explicit Shape(Function &F, bool OptimizeFrame = false) + : OptimizeFrame(OptimizeFrame) { + analyze(F); + if (!CoroBegin) { + invalidateCoroutine(F); + return; + } + initABI(); + tidyCoroutine(); + } +}; + +} // end namespace coro + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_COROUTINES_COROSHAPE_H diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index cdc442bc819c3..c1042b21883f6 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -12,6 +12,7 @@ #include "CoroInstr.h" #include "CoroInternal.h" +#include "CoroShape.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" @@ -176,17 +177,6 @@ void coro::suppressCoroAllocs(LLVMContext &Context, } } -static void clear(coro::Shape &Shape) { - Shape.CoroBegin = nullptr; - Shape.CoroEnds.clear(); - Shape.CoroSizes.clear(); - Shape.CoroSuspends.clear(); - - Shape.FrameTy = nullptr; - Shape.FramePtr = nullptr; - Shape.AllocaSpillBlock = nullptr; -} - static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, CoroSuspendInst *SuspendInst) { Module *M = SuspendInst->getModule(); @@ -199,13 +189,12 @@ static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, } // Collect "interesting" coroutine intrinsics. -void coro::Shape::buildFrom(Function &F) { +void coro::Shape::analyze(Function &F) { + clear(); + bool HasFinalSuspend = false; bool HasUnwindCoroEnd = false; size_t FinalSuspendIndex = 0; - clear(*this); - SmallVector CoroFrames; - SmallVector UnusedCoroSaves; for (Instruction &I : instructions(F)) { // FIXME: coro_await_suspend_* are not proper `IntrinisicInst`s @@ -297,8 +286,58 @@ void coro::Shape::buildFrom(Function &F) { } } - // If for some reason, we were not able to find coro.begin, bailout. - if (!CoroBegin) { + // If there is no CoroBegin then this is not a coroutine. + if (!CoroBegin) + return; + + // Determination of ABI and initializing lowering info + auto Id = CoroBegin->getId(); + auto IntrID = Id->getIntrinsicID(); + if (IntrID == Intrinsic::coro_id) { + ABI = coro::ABI::Switch; + SwitchLowering.HasFinalSuspend = HasFinalSuspend; + SwitchLowering.HasUnwindCoroEnd = HasUnwindCoroEnd; + + auto SwitchId = getSwitchCoroId(); + SwitchLowering.ResumeSwitch = nullptr; + SwitchLowering.PromiseAlloca = SwitchId->getPromise(); + SwitchLowering.ResumeEntryBlock = nullptr; + + // Move final suspend to the last element in the CoroSuspends vector. + if (SwitchLowering.HasFinalSuspend && + FinalSuspendIndex != CoroSuspends.size() - 1) + std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back()); + } else if (IntrID == Intrinsic::coro_id_async) { + ABI = coro::ABI::Async; + auto *AsyncId = getAsyncCoroId(); + AsyncId->checkWellFormed(); + AsyncLowering.Context = AsyncId->getStorage(); + AsyncLowering.ContextArgNo = AsyncId->getStorageArgumentIndex(); + AsyncLowering.ContextHeaderSize = AsyncId->getStorageSize(); + AsyncLowering.ContextAlignment = AsyncId->getStorageAlignment().value(); + AsyncLowering.AsyncFuncPointer = AsyncId->getAsyncFunctionPointer(); + AsyncLowering.AsyncCC = F.getCallingConv(); + } else if (IntrID == Intrinsic::coro_id_retcon || + IntrID == Intrinsic::coro_id_retcon_once) { + ABI = IntrID == Intrinsic::coro_id_retcon ? 
coro::ABI::Retcon + : coro::ABI::RetconOnce; + auto ContinuationId = getRetconCoroId(); + ContinuationId->checkWellFormed(); + auto Prototype = ContinuationId->getPrototype(); + RetconLowering.ResumePrototype = Prototype; + RetconLowering.Alloc = ContinuationId->getAllocFunction(); + RetconLowering.Dealloc = ContinuationId->getDeallocFunction(); + RetconLowering.ReturnBlock = nullptr; + RetconLowering.IsFrameInlineInStorage = false; + } else { + llvm_unreachable("coro.begin is not dependent on a coro.id call"); + } +} + +// If for some reason, we were not able to find coro.begin, bailout. +void coro::Shape::invalidateCoroutine(Function &F) { + assert(!CoroBegin); + { // Replace coro.frame which are supposed to be lowered to the result of // coro.begin with undef. auto *Undef = UndefValue::get(PointerType::get(F.getContext(), 0)); @@ -319,21 +358,13 @@ void coro::Shape::buildFrom(Function &F) { // Replace all coro.ends with unreachable instruction. for (AnyCoroEndInst *CE : CoroEnds) changeToUnreachable(CE); - - return; } +} - auto Id = CoroBegin->getId(); - switch (auto IdIntrinsic = Id->getIntrinsicID()) { - case Intrinsic::coro_id: { - auto SwitchId = cast(Id); - this->ABI = coro::ABI::Switch; - this->SwitchLowering.HasFinalSuspend = HasFinalSuspend; - this->SwitchLowering.HasUnwindCoroEnd = HasUnwindCoroEnd; - this->SwitchLowering.ResumeSwitch = nullptr; - this->SwitchLowering.PromiseAlloca = SwitchId->getPromise(); - this->SwitchLowering.ResumeEntryBlock = nullptr; - +// Perform semantic checking and initialization of the ABI +void coro::Shape::initABI() { + switch (ABI) { + case coro::ABI::Switch: { for (auto *AnySuspend : CoroSuspends) { auto Suspend = dyn_cast(AnySuspend); if (!Suspend) { @@ -348,33 +379,11 @@ void coro::Shape::buildFrom(Function &F) { } break; } - case Intrinsic::coro_id_async: { - auto *AsyncId = cast(Id); - AsyncId->checkWellFormed(); - this->ABI = coro::ABI::Async; - this->AsyncLowering.Context = AsyncId->getStorage(); - this->AsyncLowering.ContextArgNo = AsyncId->getStorageArgumentIndex(); - this->AsyncLowering.ContextHeaderSize = AsyncId->getStorageSize(); - this->AsyncLowering.ContextAlignment = - AsyncId->getStorageAlignment().value(); - this->AsyncLowering.AsyncFuncPointer = AsyncId->getAsyncFunctionPointer(); - this->AsyncLowering.AsyncCC = F.getCallingConv(); + case coro::ABI::Async: { break; }; - case Intrinsic::coro_id_retcon: - case Intrinsic::coro_id_retcon_once: { - auto ContinuationId = cast(Id); - ContinuationId->checkWellFormed(); - this->ABI = (IdIntrinsic == Intrinsic::coro_id_retcon - ? coro::ABI::Retcon - : coro::ABI::RetconOnce); - auto Prototype = ContinuationId->getPrototype(); - this->RetconLowering.ResumePrototype = Prototype; - this->RetconLowering.Alloc = ContinuationId->getAllocFunction(); - this->RetconLowering.Dealloc = ContinuationId->getDeallocFunction(); - this->RetconLowering.ReturnBlock = nullptr; - this->RetconLowering.IsFrameInlineInStorage = false; - + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { // Determine the result value types, and make sure they match up with // the values passed to the suspends. 
auto ResultTys = getRetconResultTypes(); @@ -407,7 +416,7 @@ void coro::Shape::buildFrom(Function &F) { #ifndef NDEBUG Suspend->dump(); - Prototype->getFunctionType()->dump(); + RetconLowering.ResumePrototype->getFunctionType()->dump(); #endif report_fatal_error("argument to coro.suspend.retcon does not " "match corresponding prototype function result"); @@ -416,14 +425,14 @@ void coro::Shape::buildFrom(Function &F) { if (SI != SE || RI != RE) { #ifndef NDEBUG Suspend->dump(); - Prototype->getFunctionType()->dump(); + RetconLowering.ResumePrototype->getFunctionType()->dump(); #endif report_fatal_error("wrong number of arguments to coro.suspend.retcon"); } // Check that the result type of the suspend matches the resume types. Type *SResultTy = Suspend->getType(); - ArrayRef SuspendResultTys; + ArrayRef SuspendResultTys; if (SResultTy->isVoidTy()) { // leave as empty array } else if (auto SResultStructTy = dyn_cast(SResultTy)) { @@ -435,7 +444,7 @@ void coro::Shape::buildFrom(Function &F) { if (SuspendResultTys.size() != ResumeTys.size()) { #ifndef NDEBUG Suspend->dump(); - Prototype->getFunctionType()->dump(); + RetconLowering.ResumePrototype->getFunctionType()->dump(); #endif report_fatal_error("wrong number of results from coro.suspend.retcon"); } @@ -443,7 +452,7 @@ void coro::Shape::buildFrom(Function &F) { if (SuspendResultTys[I] != ResumeTys[I]) { #ifndef NDEBUG Suspend->dump(); - Prototype->getFunctionType()->dump(); + RetconLowering.ResumePrototype->getFunctionType()->dump(); #endif report_fatal_error("result from coro.suspend.retcon does not " "match corresponding prototype function param"); @@ -452,23 +461,18 @@ void coro::Shape::buildFrom(Function &F) { } break; } - default: llvm_unreachable("coro.begin is not dependent on a coro.id call"); } +} +void coro::Shape::tidyCoroutine() { // The coro.free intrinsic is always lowered to the result of coro.begin. for (CoroFrameInst *CF : CoroFrames) { CF->replaceAllUsesWith(CoroBegin); CF->eraseFromParent(); } - // Move final suspend to be the last element in the CoroSuspends vector. - if (ABI == coro::ABI::Switch && - SwitchLowering.HasFinalSuspend && - FinalSuspendIndex != CoroSuspends.size() - 1) - std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back()); - // Remove orphaned coro.saves. 
for (CoroSaveInst *CoroSave : UnusedCoroSaves) CoroSave->eraseFromParent(); diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll index 6da019a79b727..25a14ef9a49ee 100644 --- a/llvm/test/CodeGen/AArch64/abs.ll +++ b/llvm/test/CodeGen/AArch64/abs.ll @@ -280,6 +280,40 @@ entry: } declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +define <2 x i128> @abs_v4i128(<2 x i128> %a){ +; CHECK-SD-LABEL: abs_v4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: asr x8, x1, #63 +; CHECK-SD-NEXT: asr x9, x3, #63 +; CHECK-SD-NEXT: eor x10, x0, x8 +; CHECK-SD-NEXT: eor x11, x1, x8 +; CHECK-SD-NEXT: subs x0, x10, x8 +; CHECK-SD-NEXT: eor x10, x2, x9 +; CHECK-SD-NEXT: sbc x1, x11, x8 +; CHECK-SD-NEXT: eor x8, x3, x9 +; CHECK-SD-NEXT: subs x2, x10, x9 +; CHECK-SD-NEXT: sbc x3, x8, x9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: abs_v4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: asr x8, x1, #63 +; CHECK-GI-NEXT: asr x9, x3, #63 +; CHECK-GI-NEXT: adds x10, x0, x8 +; CHECK-GI-NEXT: adc x11, x1, x8 +; CHECK-GI-NEXT: adds x12, x2, x9 +; CHECK-GI-NEXT: eor x0, x10, x8 +; CHECK-GI-NEXT: adc x13, x3, x9 +; CHECK-GI-NEXT: eor x1, x11, x8 +; CHECK-GI-NEXT: eor x2, x12, x9 +; CHECK-GI-NEXT: eor x3, x13, x9 +; CHECK-GI-NEXT: ret +entry: + %res = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %a, i1 0) + ret <2 x i128> %res +} +declare <2 x i128> @llvm.abs.v2i128(<2 x i128>, i1) + ; ===== Vectors with Non-Pow 2 Widths ===== define <3 x i8> @abs_v3i8(<3 x i8> %a){ diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll index f8397290ab5e1..1ed2e09c6b4d4 100644 --- a/llvm/test/CodeGen/AArch64/expand-select.ll +++ b/llvm/test/CodeGen/AArch64/expand-select.ll @@ -33,24 +33,20 @@ define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) { ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0x1 ; CHECK-NEXT: fmov s0, wzr -; CHECK-NEXT: ldr x11, [sp, #16] +; CHECK-NEXT: ldr x10, [sp, #16] ; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: ldp x9, x10, [sp] ; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s -; CHECK-NEXT: dup v1.4s, v0.s[0] -; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: tst w8, #0x1 ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: csel x10, x5, x10, ne -; CHECK-NEXT: csel x9, x4, x9, ne -; CHECK-NEXT: stur x9, [x11, #12] ; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: str w10, [x11, #20] -; CHECK-NEXT: csel x8, x2, x6, ne +; CHECK-NEXT: ldp x9, x8, [sp] +; CHECK-NEXT: csel x11, x2, x6, ne +; CHECK-NEXT: str x11, [x10] +; CHECK-NEXT: csel x9, x4, x9, ne +; CHECK-NEXT: csel x8, x5, x8, ne +; CHECK-NEXT: stur x9, [x10, #12] ; CHECK-NEXT: csel x9, x3, x7, ne -; CHECK-NEXT: str x8, [x11] -; CHECK-NEXT: str w9, [x11, #8] +; CHECK-NEXT: str w8, [x10, #20] +; CHECK-NEXT: str w9, [x10, #8] ; CHECK-NEXT: ret %cond = and i32 %In1, 1 %cbool = icmp eq i32 %cond, 0 diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index 8ca1e9ee5b617..baab53d8bdbd4 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -1,11 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16 -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s 
--check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16 -; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16 - -; CHECK-GI: warning: Instruction selection used fallback path for v2f128_fp128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_fp128 +; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16 +; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16 define fp128 @f128_fp128(fp128 %a, fp128 %b, fp128 %d, fp128 %e) { ; CHECK-SD-LABEL: f128_fp128: @@ -429,35 +426,74 @@ entry: } define <2 x fp128> @v2f128_fp128(<2 x fp128> %a, <2 x fp128> %b, <2 x fp128> %d, <2 x fp128> %e) { -; CHECK-LABEL: v2f128_fp128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 112 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: stp q4, q5, [sp] // 32-byte Folded Spill -; CHECK-NEXT: stp q1, q3, [sp, #32] // 32-byte Folded Spill -; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: b.ge .LBB12_2 -; CHECK-NEXT: // %bb.1: // %entry -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: .LBB12_2: // %entry -; CHECK-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: b.ge .LBB12_4 -; CHECK-NEXT: // %bb.3: // %entry -; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: .LBB12_4: // %entry -; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #112 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v2f128_fp128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #112 +; CHECK-SD-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 112 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: stp q4, q5, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: stp q1, q3, [sp, #32] // 32-byte Folded Spill +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill +; CHECK-SD-NEXT: bl __lttf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: b.ge .LBB12_2 +; CHECK-SD-NEXT: // %bb.1: // %entry +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: .LBB12_2: // %entry +; CHECK-SD-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload +; CHECK-SD-NEXT: bl __lttf2 +; CHECK-SD-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: b.ge .LBB12_4 +; CHECK-SD-NEXT: // %bb.3: // %entry +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: .LBB12_4: // %entry +; CHECK-SD-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #112 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2f128_fp128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #112 +; CHECK-GI-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill +; 
CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: stp q3, q1, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v2.16b +; CHECK-GI-NEXT: stp q6, q4, [sp, #32] // 32-byte Folded Spill +; CHECK-GI-NEXT: stp q7, q5, [sp, #64] // 32-byte Folded Spill +; CHECK-GI-NEXT: bl __lttf2 +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: bl __lttf2 +; CHECK-GI-NEXT: ldp q3, q2, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: cmp w19, #0 +; CHECK-GI-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov d0, v2.d[1] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fcsel d2, d2, d3, lt +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fcsel d3, d0, d1, lt +; CHECK-GI-NEXT: ldp q5, q0, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: mov d4, v5.d[1] +; CHECK-GI-NEXT: fcsel d0, d0, d5, lt +; CHECK-GI-NEXT: fmov x9, d0 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: fmov x8, d3 +; CHECK-GI-NEXT: fcsel d2, d1, d4, lt +; CHECK-GI-NEXT: mov v1.d[0], x9 +; CHECK-GI-NEXT: fmov x9, d2 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov v1.d[1], x9 +; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: ret entry: %c = fcmp olt <2 x fp128> %a, %b %s = select <2 x i1> %c, <2 x fp128> %d, <2 x fp128> %e @@ -465,42 +501,104 @@ entry: } define <3 x fp128> @v3f128_fp128(<3 x fp128> %a, <3 x fp128> %b, <3 x fp128> %d, <3 x fp128> %e) { -; CHECK-LABEL: v3f128_fp128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 112 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: stp q1, q4, [sp] // 32-byte Folded Spill -; CHECK-NEXT: mov v1.16b, v3.16b -; CHECK-NEXT: stp q2, q5, [sp, #32] // 32-byte Folded Spill -; CHECK-NEXT: stp q6, q7, [sp, #64] // 32-byte Folded Spill -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: b.lt .LBB13_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: ldr q0, [sp, #128] -; CHECK-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: .LBB13_2: // %entry -; CHECK-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: b.lt .LBB13_4 -; CHECK-NEXT: // %bb.3: -; CHECK-NEXT: ldr q0, [sp, #144] -; CHECK-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: .LBB13_4: // %entry -; CHECK-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: add x8, sp, #160 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: add x9, sp, #112 -; CHECK-NEXT: csel x8, x9, x8, lt -; CHECK-NEXT: ldp q0, q1, [sp, #64] // 32-byte Folded Reload -; CHECK-NEXT: ldr q2, [x8] -; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #112 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v3f128_fp128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #112 +; CHECK-SD-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 112 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: stp q1, q4, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: mov v1.16b, v3.16b +; CHECK-SD-NEXT: stp q2, q5, [sp, #32] // 32-byte Folded Spill +; CHECK-SD-NEXT: stp q6, q7, [sp, #64] // 32-byte Folded Spill +; CHECK-SD-NEXT: bl __lttf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: b.lt .LBB13_2 +; CHECK-SD-NEXT: // %bb.1: +; CHECK-SD-NEXT: ldr q0, 
[sp, #128] +; CHECK-SD-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: .LBB13_2: // %entry +; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: bl __lttf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: b.lt .LBB13_4 +; CHECK-SD-NEXT: // %bb.3: +; CHECK-SD-NEXT: ldr q0, [sp, #144] +; CHECK-SD-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: .LBB13_4: // %entry +; CHECK-SD-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload +; CHECK-SD-NEXT: bl __lttf2 +; CHECK-SD-NEXT: add x8, sp, #160 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: add x9, sp, #112 +; CHECK-SD-NEXT: csel x8, x9, x8, lt +; CHECK-SD-NEXT: ldp q0, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldr q2, [x8] +; CHECK-SD-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #112 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3f128_fp128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #192 +; CHECK-GI-NEXT: str x30, [sp, #160] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 192 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: stp q4, q1, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v3.16b +; CHECK-GI-NEXT: stp q5, q2, [sp, #32] // 32-byte Folded Spill +; CHECK-GI-NEXT: ldr q2, [sp, #192] +; CHECK-GI-NEXT: str q2, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q2, [sp, #208] +; CHECK-GI-NEXT: stp q2, q6, [sp, #64] // 32-byte Folded Spill +; CHECK-GI-NEXT: ldr q2, [sp, #224] +; CHECK-GI-NEXT: stp q7, q2, [sp, #96] // 32-byte Folded Spill +; CHECK-GI-NEXT: ldr q2, [sp, #240] +; CHECK-GI-NEXT: str q2, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __lttf2 +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: bl __lttf2 +; CHECK-GI-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov w20, w0 +; CHECK-GI-NEXT: bl __lttf2 +; CHECK-GI-NEXT: ldp q5, q4, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: cmp w19, #0 +; CHECK-GI-NEXT: ldp q7, q6, [sp, #96] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov d0, v4.d[1] +; CHECK-GI-NEXT: mov d1, v5.d[1] +; CHECK-GI-NEXT: fcsel d4, d4, d5, lt +; CHECK-GI-NEXT: mov d2, v7.d[1] +; CHECK-GI-NEXT: mov d3, v6.d[1] +; CHECK-GI-NEXT: fmov x8, d4 +; CHECK-GI-NEXT: fcsel d5, d0, d1, lt +; CHECK-GI-NEXT: cmp w20, #0 +; CHECK-GI-NEXT: fcsel d1, d7, d6, lt +; CHECK-GI-NEXT: ldp q7, q0, [sp, #128] // 32-byte Folded Reload +; CHECK-GI-NEXT: fcsel d3, d2, d3, lt +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d6, v7.d[1] +; CHECK-GI-NEXT: fcsel d7, d0, d7, lt +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: fmov x9, d7 +; CHECK-GI-NEXT: fcsel d4, d2, d6, lt +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: fmov x8, d5 +; CHECK-GI-NEXT: mov v2.d[0], x9 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: fmov x10, d4 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov v1.d[1], x9 +; CHECK-GI-NEXT: mov v2.d[1], x10 +; CHECK-GI-NEXT: add sp, sp, #192 +; CHECK-GI-NEXT: ret entry: %c = fcmp olt <3 x fp128> %a, %b %s = select <3 x i1> %c, <3 x fp128> %d, <3 x fp128> %e diff --git a/llvm/test/CodeGen/AArch64/icmp.ll 
b/llvm/test/CodeGen/AArch64/icmp.ll index b00e5d6c701d8..61964060ca2c8 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i64 @i64_i64(i64 %a, i64 %b, i64 %d, i64 %e) { ; CHECK-LABEL: i64_i64: @@ -1376,6 +1376,62 @@ entry: ret <32 x i8> %s } +define <2 x i128> @v2i128_i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %d, <2 x i128> %e) { +; CHECK-SD-LABEL: v2i128_i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add x10, sp, #32 +; CHECK-SD-NEXT: mov x11, sp +; CHECK-SD-NEXT: cmp x0, x4 +; CHECK-SD-NEXT: orr x12, x10, #0x8 +; CHECK-SD-NEXT: orr x13, x11, #0x8 +; CHECK-SD-NEXT: sbcs xzr, x1, x5 +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: csel x12, x13, x12, lt +; CHECK-SD-NEXT: csel x10, x11, x10, lt +; CHECK-SD-NEXT: cmp x2, x6 +; CHECK-SD-NEXT: orr x11, x8, #0x8 +; CHECK-SD-NEXT: orr x13, x9, #0x8 +; CHECK-SD-NEXT: sbcs xzr, x3, x7 +; CHECK-SD-NEXT: ldr x0, [x10] +; CHECK-SD-NEXT: csel x8, x9, x8, lt +; CHECK-SD-NEXT: csel x9, x13, x11, lt +; CHECK-SD-NEXT: ldr x1, [x12] +; CHECK-SD-NEXT: ldr x2, [x8] +; CHECK-SD-NEXT: ldr x3, [x9] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i128_i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp x1, x5 +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: cset w10, lt +; CHECK-GI-NEXT: cmp x0, x4 +; CHECK-GI-NEXT: cset w13, lo +; CHECK-GI-NEXT: cmp x1, x5 +; CHECK-GI-NEXT: csel w10, w13, w10, eq +; CHECK-GI-NEXT: cmp x3, x7 +; CHECK-GI-NEXT: ldp x13, x14, [sp, #32] +; CHECK-GI-NEXT: cset w15, lt +; CHECK-GI-NEXT: cmp x2, x6 +; CHECK-GI-NEXT: ldp x11, x12, [sp, #16] +; CHECK-GI-NEXT: cset w16, lo +; CHECK-GI-NEXT: cmp x3, x7 +; CHECK-GI-NEXT: ldp x17, x18, [sp, #48] +; CHECK-GI-NEXT: csel w15, w16, w15, eq +; CHECK-GI-NEXT: tst w10, #0x1 +; CHECK-GI-NEXT: csel x0, x8, x13, ne +; CHECK-GI-NEXT: csel x1, x9, x14, ne +; CHECK-GI-NEXT: tst w15, #0x1 +; CHECK-GI-NEXT: csel x2, x11, x17, ne +; CHECK-GI-NEXT: csel x3, x12, x18, ne +; CHECK-GI-NEXT: ret +entry: + %c = icmp slt <2 x i128> %a, %b + %s = select <2 x i1> %c, <2 x i128> %d, <2 x i128> %e + ret <2 x i128> %s +} + ; ===== ICMP Zero RHS ===== define <8 x i1> @icmp_eq_v8i8_Zero_RHS(<8 x i8> %a) { diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll index 951458da17c07..7014a4a9acbe0 100644 --- a/llvm/test/CodeGen/AArch64/shift.ll +++ b/llvm/test/CodeGen/AArch64/shift.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i1 @shl_i1(i1 %0, i1 %1){ @@ -674,6 +674,61 @@ define <4 x i64> @shl_v4i64(<4 x i64> %0, <4 x i64> %1){ ret <4 x i64> %3 } +define <2 x i128> @shl_v2i128(<2 x i128> %0, <2 x i128> %1){ 
+; CHECK-SD-LABEL: shl_v2i128: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsr x8, x0, #1 +; CHECK-SD-NEXT: mvn w9, w4 +; CHECK-SD-NEXT: lsl x10, x1, x4 +; CHECK-SD-NEXT: mvn w12, w6 +; CHECK-SD-NEXT: lsl x11, x0, x4 +; CHECK-SD-NEXT: lsl x13, x3, x6 +; CHECK-SD-NEXT: lsr x8, x8, x9 +; CHECK-SD-NEXT: lsr x9, x2, #1 +; CHECK-SD-NEXT: tst x4, #0x40 +; CHECK-SD-NEXT: csel x0, xzr, x11, ne +; CHECK-SD-NEXT: lsr x9, x9, x12 +; CHECK-SD-NEXT: orr x8, x10, x8 +; CHECK-SD-NEXT: lsl x10, x2, x6 +; CHECK-SD-NEXT: csel x1, x11, x8, ne +; CHECK-SD-NEXT: tst x6, #0x40 +; CHECK-SD-NEXT: orr x8, x13, x9 +; CHECK-SD-NEXT: csel x2, xzr, x10, ne +; CHECK-SD-NEXT: csel x3, x10, x8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_v2i128: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: sub x10, x4, #64 +; CHECK-GI-NEXT: lsl x11, x1, x4 +; CHECK-GI-NEXT: sub x9, x8, x4 +; CHECK-GI-NEXT: lsl x10, x0, x10 +; CHECK-GI-NEXT: lsl x12, x0, x4 +; CHECK-GI-NEXT: lsr x9, x0, x9 +; CHECK-GI-NEXT: cmp x4, #64 +; CHECK-GI-NEXT: sub x8, x8, x6 +; CHECK-GI-NEXT: lsr x8, x2, x8 +; CHECK-GI-NEXT: csel x0, x12, xzr, lo +; CHECK-GI-NEXT: lsl x12, x2, x6 +; CHECK-GI-NEXT: orr x9, x9, x11 +; CHECK-GI-NEXT: lsl x11, x3, x6 +; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: sub x10, x6, #64 +; CHECK-GI-NEXT: cmp x4, #0 +; CHECK-GI-NEXT: lsl x10, x2, x10 +; CHECK-GI-NEXT: csel x1, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x8, x11 +; CHECK-GI-NEXT: cmp x6, #64 +; CHECK-GI-NEXT: csel x2, x12, xzr, lo +; CHECK-GI-NEXT: csel x8, x8, x10, lo +; CHECK-GI-NEXT: cmp x6, #0 +; CHECK-GI-NEXT: csel x3, x3, x8, eq +; CHECK-GI-NEXT: ret + %3 = shl <2 x i128> %0, %1 + ret <2 x i128> %3 +} + define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; CHECK-SD-LABEL: ashr_v4i8: ; CHECK-SD: // %bb.0: @@ -819,6 +874,67 @@ define <4 x i64> @ashr_v4i64(<4 x i64> %0, <4 x i64> %1){ ret <4 x i64> %3 } +define <2 x i128> @ashr_v2i128(<2 x i128> %0, <2 x i128> %1){ +; CHECK-SD-LABEL: ashr_v2i128: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl x8, x1, #1 +; CHECK-SD-NEXT: mvn w9, w4 +; CHECK-SD-NEXT: lsl x10, x3, #1 +; CHECK-SD-NEXT: lsr x11, x0, x4 +; CHECK-SD-NEXT: lsr x12, x2, x6 +; CHECK-SD-NEXT: asr x13, x1, #63 +; CHECK-SD-NEXT: lsl x8, x8, x9 +; CHECK-SD-NEXT: mvn w9, w6 +; CHECK-SD-NEXT: tst x4, #0x40 +; CHECK-SD-NEXT: lsl x9, x10, x9 +; CHECK-SD-NEXT: asr x10, x1, x4 +; CHECK-SD-NEXT: asr x14, x3, #63 +; CHECK-SD-NEXT: orr x8, x8, x11 +; CHECK-SD-NEXT: asr x11, x3, x6 +; CHECK-SD-NEXT: csel x0, x10, x8, ne +; CHECK-SD-NEXT: orr x8, x9, x12 +; CHECK-SD-NEXT: csel x1, x13, x10, ne +; CHECK-SD-NEXT: tst x6, #0x40 +; CHECK-SD-NEXT: csel x2, x11, x8, ne +; CHECK-SD-NEXT: csel x3, x14, x11, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_v2i128: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: sub x10, x4, #64 +; CHECK-GI-NEXT: lsr x11, x0, x4 +; CHECK-GI-NEXT: sub x9, x8, x4 +; CHECK-GI-NEXT: asr x10, x1, x10 +; CHECK-GI-NEXT: cmp x4, #64 +; CHECK-GI-NEXT: lsl x9, x1, x9 +; CHECK-GI-NEXT: sub x8, x8, x6 +; CHECK-GI-NEXT: asr x12, x1, x4 +; CHECK-GI-NEXT: lsl x8, x3, x8 +; CHECK-GI-NEXT: orr x9, x11, x9 +; CHECK-GI-NEXT: asr x11, x1, #63 +; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: cmp x4, #0 +; CHECK-GI-NEXT: lsr x10, x2, x6 +; CHECK-GI-NEXT: csel x0, x0, x9, eq +; CHECK-GI-NEXT: sub x9, x6, #64 +; CHECK-GI-NEXT: cmp x4, #64 +; CHECK-GI-NEXT: asr x9, x3, x9 +; CHECK-GI-NEXT: csel x1, x12, x11, lo +; CHECK-GI-NEXT: orr x8, x10, x8 +; CHECK-GI-NEXT: cmp x6, #64 +; CHECK-GI-NEXT: asr 
x11, x3, x6 +; CHECK-GI-NEXT: asr x10, x3, #63 +; CHECK-GI-NEXT: csel x8, x8, x9, lo +; CHECK-GI-NEXT: cmp x6, #0 +; CHECK-GI-NEXT: csel x2, x2, x8, eq +; CHECK-GI-NEXT: cmp x6, #64 +; CHECK-GI-NEXT: csel x3, x11, x10, lo +; CHECK-GI-NEXT: ret + %3 = ashr <2 x i128> %0, %1 + ret <2 x i128> %3 +} + define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; CHECK-SD-LABEL: lshr_v4i8: ; CHECK-SD: // %bb.0: @@ -962,6 +1078,63 @@ define <4 x i64> @lshr_v4i64(<4 x i64> %0, <4 x i64> %1){ ret <4 x i64> %3 } +define <2 x i128> @lshr_v2i128(<2 x i128> %0, <2 x i128> %1){ +; CHECK-SD-LABEL: lshr_v2i128: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl x8, x1, #1 +; CHECK-SD-NEXT: mvn w9, w4 +; CHECK-SD-NEXT: lsr x10, x0, x4 +; CHECK-SD-NEXT: mvn w12, w6 +; CHECK-SD-NEXT: lsr x11, x1, x4 +; CHECK-SD-NEXT: lsr x13, x2, x6 +; CHECK-SD-NEXT: lsl x8, x8, x9 +; CHECK-SD-NEXT: lsl x9, x3, #1 +; CHECK-SD-NEXT: tst x4, #0x40 +; CHECK-SD-NEXT: csel x1, xzr, x11, ne +; CHECK-SD-NEXT: lsl x9, x9, x12 +; CHECK-SD-NEXT: orr x8, x8, x10 +; CHECK-SD-NEXT: lsr x10, x3, x6 +; CHECK-SD-NEXT: csel x0, x11, x8, ne +; CHECK-SD-NEXT: tst x6, #0x40 +; CHECK-SD-NEXT: orr x8, x9, x13 +; CHECK-SD-NEXT: csel x3, xzr, x10, ne +; CHECK-SD-NEXT: csel x2, x10, x8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_v2i128: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: sub x10, x4, #64 +; CHECK-GI-NEXT: lsr x11, x0, x4 +; CHECK-GI-NEXT: sub x9, x8, x4 +; CHECK-GI-NEXT: lsr x10, x1, x10 +; CHECK-GI-NEXT: cmp x4, #64 +; CHECK-GI-NEXT: lsl x9, x1, x9 +; CHECK-GI-NEXT: sub x8, x8, x6 +; CHECK-GI-NEXT: lsr x12, x1, x4 +; CHECK-GI-NEXT: lsl x8, x3, x8 +; CHECK-GI-NEXT: orr x9, x11, x9 +; CHECK-GI-NEXT: lsr x11, x2, x6 +; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: cmp x4, #0 +; CHECK-GI-NEXT: sub x10, x6, #64 +; CHECK-GI-NEXT: csel x0, x0, x9, eq +; CHECK-GI-NEXT: cmp x4, #64 +; CHECK-GI-NEXT: lsr x9, x3, x10 +; CHECK-GI-NEXT: csel x1, x12, xzr, lo +; CHECK-GI-NEXT: orr x8, x11, x8 +; CHECK-GI-NEXT: cmp x6, #64 +; CHECK-GI-NEXT: lsr x10, x3, x6 +; CHECK-GI-NEXT: csel x8, x8, x9, lo +; CHECK-GI-NEXT: cmp x6, #0 +; CHECK-GI-NEXT: csel x2, x2, x8, eq +; CHECK-GI-NEXT: cmp x6, #64 +; CHECK-GI-NEXT: csel x3, x10, xzr, lo +; CHECK-GI-NEXT: ret + %3 = lshr <2 x i128> %0, %1 + ret <2 x i128> %3 +} + ; ===== Vector with Non-Pow 2 Width ===== define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){ diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll index 6e0cfdd26a786..b0ca0069a526b 100644 --- a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll +++ b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll @@ -23,6 +23,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ ; Additional tests for 64-bit divide bypass diff --git a/llvm/test/CodeGen/X86/cmp16.ll b/llvm/test/CodeGen/X86/cmp16.ll index fa9e75ff16a5c..8c14a78d9e113 100644 --- a/llvm/test/CodeGen/X86/cmp16.ll +++ b/llvm/test/CodeGen/X86/cmp16.ll @@ -13,6 +13,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=X64,X64-FAST ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=X64,X64-FAST ; RUN: llc < %s -mtriple=x86_64-- 
-mcpu=znver4 | FileCheck %s --check-prefixes=X64,X64-FAST +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=X64,X64-FAST define i1 @cmp16_reg_eq_reg(i16 %a0, i16 %a1) { ; X86-GENERIC-LABEL: cmp16_reg_eq_reg: diff --git a/llvm/test/CodeGen/X86/cpus-amd.ll b/llvm/test/CodeGen/X86/cpus-amd.ll index 228a00428c457..33b2cf3731478 100644 --- a/llvm/test/CodeGen/X86/cpus-amd.ll +++ b/llvm/test/CodeGen/X86/cpus-amd.ll @@ -29,6 +29,7 @@ ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver4 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver5 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty define void @foo() { ret void diff --git a/llvm/test/CodeGen/X86/rdpru.ll b/llvm/test/CodeGen/X86/rdpru.ll index 7771f52653cb5..be79a4499a338 100644 --- a/llvm/test/CodeGen/X86/rdpru.ll +++ b/llvm/test/CodeGen/X86/rdpru.ll @@ -6,6 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 -fast-isel | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 -fast-isel | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 -fast-isel | FileCheck %s --check-prefix=X64 define void @rdpru_asm() { ; X86-LABEL: rdpru_asm: diff --git a/llvm/test/CodeGen/X86/shuffle-as-shifts.ll b/llvm/test/CodeGen/X86/shuffle-as-shifts.ll index e89197f5b42c3..9c8729b3ea505 100644 --- a/llvm/test/CodeGen/X86/shuffle-as-shifts.ll +++ b/llvm/test/CodeGen/X86/shuffle-as-shifts.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-ICX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 define <4 x i32> @shuf_rot_v4i32_1032(<4 x i32> %x) { diff --git a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll index d74d195439bda..ceef3fb4bb188 100644 --- a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll +++ b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll @@ -50,6 +50,7 @@ ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver2 2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver3 2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver4 2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX512 +; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver5 2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX512 ; Other chips with slow unaligned memory accesses diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll index 9f2071ff14b87..2b78a70ebcc26 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -6,6 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s 
--check-prefixes=FAST-SCALAR,FAST-VECTOR ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X86-64 define float @f32_no_daz(float %f) #0 { diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll index 7d8bb567c09b3..162ab71fc00d4 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll @@ -4,6 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 define <8 x double> @transform_VPERMILPSZrr(<8 x double> %a) nounwind { ; CHECK-LABEL: transform_VPERMILPSZrr: diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll index 5d031f6017c77..cd97946da248f 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll @@ -4,6 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 define <16 x float> @transform_VPERMILPSZrr(<16 x float> %a) nounwind { ; CHECK-LABEL: transform_VPERMILPSZrr: diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll index 4a160bc9debc7..5ea991f85523e 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 define <16 x float> @transform_VUNPCKLPDZrr(<16 x float> %a, <16 x float> %b) nounwind { diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckps-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckps-avx512.ll index d0e3ad9b19086..96155f0300d2d 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckps-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckps-avx512.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s 
--check-prefixes=CHECK,CHECK-V4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 define <16 x float> @transform_VUNPCKLPSZrr(<16 x float> %a, <16 x float> %b) nounwind { ; CHECK-LABEL: transform_VUNPCKLPSZrr: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-fast-per-lane.ll b/llvm/test/CodeGen/X86/vector-shuffle-fast-per-lane.ll index e59532d4fef30..4021b1bf292bb 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-fast-per-lane.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-fast-per-lane.ll @@ -8,6 +8,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=znver2 | FileCheck %s --check-prefixes=FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=znver3 | FileCheck %s --check-prefixes=FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell | FileCheck %s --check-prefixes=FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s --check-prefixes=FAST diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll index e6a07b4aeb271..3c1eb92e9e3c3 100644 --- a/llvm/test/CodeGen/X86/vpdpwssd.ll +++ b/llvm/test/CodeGen/X86/vpdpwssd.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) { diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll index af6fbdc9f60de..bbaa414924707 100644 --- a/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll +++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll @@ -16,6 +16,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s ; Verify that for the X86_64 processors that are known to have poor latency ; double precision shift instructions we do not generate 'shld' or 'shrd' diff --git a/llvm/test/MC/X86/x86_long_nop.s b/llvm/test/MC/X86/x86_long_nop.s index 6136c3db9a3da..b79403bb5f1ec 100644 --- a/llvm/test/MC/X86/x86_long_nop.s +++ b/llvm/test/MC/X86/x86_long_nop.s @@ -19,6 +19,8 @@ # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver3 | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver4 %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver4 | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver5 %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 +# RUN: llvm-mc 
-filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver5 | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=nehalem %s | llvm-objdump -d --no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=westmere %s | llvm-objdump -d --no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=sandybridge %s | llvm-objdump -d --no-show-raw-insn - | FileCheck --check-prefix=LNOP15 %s
diff --git a/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll b/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll
index abdcfcf7e0742..b05994ddfa35e 100644
--- a/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll
+++ b/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll
@@ -1,6 +1,7 @@
 ; RUN: opt -passes=debugify,loop-unroll -mcpu=znver3 -pass-remarks=loop-unroll -pass-remarks-analysis=loop-unroll < %s -S 2>&1 | FileCheck --check-prefixes=ALL,UNROLL %s
 ; RUN: opt -passes=debugify,loop-unroll -mcpu=znver3 -pass-remarks=TTI -pass-remarks-analysis=TTI < %s -S 2>&1 | FileCheck --check-prefixes=ALL,TTI %s
 ; RUN: opt -passes=debugify,loop-unroll -mcpu=znver4 -pass-remarks=loop-unroll -pass-remarks-analysis=loop-unroll < %s -S 2>&1 | FileCheck --check-prefixes=ALL,UNROLL %s
+; RUN: opt -passes=debugify,loop-unroll -mcpu=znver5 -pass-remarks=loop-unroll -pass-remarks-analysis=loop-unroll < %s -S 2>&1 | FileCheck --check-prefixes=ALL,UNROLL %s
 ; RUN: opt -passes=debugify,loop-unroll -mcpu=znver3 -pass-remarks=loop-unroll -pass-remarks-analysis=loop-unroll < %s -S 2>&1 --try-experimental-debuginfo-iterators | FileCheck --check-prefixes=ALL,UNROLL %s
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr63668.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr63668.ll
index 391771e06cab8..037e073de9d59 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr63668.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr63668.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=znver4 -S < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=znver5 -S < %s | FileCheck %s

 define internal i32 @testfunc() {
 ; CHECK-LABEL: define internal i32 @testfunc
diff --git a/llvm/tools/llvm-exegesis/lib/LlvmState.h b/llvm/tools/llvm-exegesis/lib/LlvmState.h
index e42393edb636d..f69d76c9a1e4e 100644
--- a/llvm/tools/llvm-exegesis/lib/LlvmState.h
+++ b/llvm/tools/llvm-exegesis/lib/LlvmState.h
@@ -76,14 +76,6 @@ class LLVMState {
     return *OpcodeNameToOpcodeIdxMapping;
   };

-  // TODO(boomanaiden154): We are keeping this getter around to enable internal
-  // migration to getRegisterNumberFromName. Once that is complete and
-  // the changes have been pulled, we can remove this.
-  const DenseMap<StringRef, unsigned> &getRegNameToRegNoMapping() const {
-    assert(RegNameToRegNoMapping);
-    return *RegNameToRegNoMapping;
-  }
-
   std::optional<MCRegister>
   getRegisterNumberFromName(StringRef RegisterName) const;
diff --git a/llvm/unittests/Bitcode/BitReaderTest.cpp b/llvm/unittests/Bitcode/BitReaderTest.cpp
index 22cc5e7492803..aea66fc1d8db5 100644
--- a/llvm/unittests/Bitcode/BitReaderTest.cpp
+++ b/llvm/unittests/Bitcode/BitReaderTest.cpp
@@ -38,7 +38,7 @@ std::unique_ptr<Module> parseAssembly(LLVMContext &Context,
   // A failure here means that the test itself is buggy.
if (!M)
-    report_fatal_error(OS.str().c_str());
+    report_fatal_error(ErrMsg.c_str());

   return M;
 }
diff --git a/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp b/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp
index db9fb3a2d316e..b0dbd4a10b0a7 100644
--- a/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp
@@ -14,7 +14,7 @@ operator<<(std::ostream &OS, const LLT Ty) {
   std::string Repr;
   raw_string_ostream SS{Repr};
   Ty.print(SS);
-  OS << SS.str();
+  OS << Repr;
   return OS;
 }

@@ -23,7 +23,7 @@ operator<<(std::ostream &OS, const MachineFunction &MF) {
   std::string Repr;
   raw_string_ostream SS{Repr};
   MF.print(SS);
-  OS << SS.str();
+  OS << Repr;
   return OS;
 }
diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp
index 401d04954a669..625e2c92b1119 100644
--- a/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp
@@ -27,7 +27,7 @@ ::testing::AssertionResult isNullMIPtr(const MachineInstr *MI) {
   MI->print(MISStream, /*IsStandalone=*/true, /*SkipOpers=*/false,
             /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
   return ::testing::AssertionFailure()
-         << "unable to legalize instruction: " << MISStream.str();
+         << "unable to legalize instruction: " << MIBuffer;
 }

 DefineLegalizerInfo(ALegalizer, {
diff --git a/llvm/unittests/CodeGen/MachineInstrTest.cpp b/llvm/unittests/CodeGen/MachineInstrTest.cpp
index af25acbb38fd5..d1546cf96f8d7 100644
--- a/llvm/unittests/CodeGen/MachineInstrTest.cpp
+++ b/llvm/unittests/CodeGen/MachineInstrTest.cpp
@@ -223,9 +223,8 @@ TEST(MachineInstrPrintingTest, DebugLocPrinting) {
   raw_string_ostream OS(str);
   MI->print(OS, /*IsStandalone*/true, /*SkipOpers*/false, /*SkipDebugLoc*/false,
             /*AddNewLine*/false);
-  ASSERT_TRUE(
-      StringRef(OS.str()).starts_with("$noreg = UNKNOWN debug-location "));
-  ASSERT_TRUE(StringRef(OS.str()).ends_with("filename:1:5"));
+  ASSERT_TRUE(StringRef(str).starts_with("$noreg = UNKNOWN debug-location "));
+  ASSERT_TRUE(StringRef(str).ends_with("filename:1:5"));
 }

 TEST(MachineInstrSpan, DistanceBegin) {
diff --git a/llvm/unittests/CodeGen/MachineOperandTest.cpp b/llvm/unittests/CodeGen/MachineOperandTest.cpp
index 8465c8b4f5394..63059d3267f71 100644
--- a/llvm/unittests/CodeGen/MachineOperandTest.cpp
+++ b/llvm/unittests/CodeGen/MachineOperandTest.cpp
@@ -73,7 +73,7 @@ TEST(MachineOperandTest, PrintRegisterMask) {
   std::string str;
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "<regmask ...>");
+  ASSERT_TRUE(str == "<regmask ...>");
 }

 TEST(MachineOperandTest, PrintSubReg) {
@@ -94,7 +94,7 @@ TEST(MachineOperandTest, PrintSubReg) {
   std::string str;
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "$physreg1.subreg5");
+  ASSERT_TRUE(str == "$physreg1.subreg5");
 }

 TEST(MachineOperandTest, PrintCImm) {
@@ -116,7 +116,7 @@ TEST(MachineOperandTest, PrintCImm) {
   std::string str;
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "i128 18446744073709551616");
+  ASSERT_TRUE(str == "i128 18446744073709551616");
 }

 TEST(MachineOperandTest, PrintSubRegIndex) {
@@ -133,7 +133,7 @@ TEST(MachineOperandTest, PrintSubRegIndex) {
   std::string str;
   raw_string_ostream OS(str);
   MachineOperand::printSubRegIdx(OS, MO.getImm(), nullptr);
-  ASSERT_TRUE(OS.str() == "%subreg.3");
+  ASSERT_TRUE(str == "%subreg.3");
 }

 TEST(MachineOperandTest, PrintCPI)
{
@@ -152,7 +152,7 @@ TEST(MachineOperandTest, PrintCPI) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "%const.0 + 8");
+    ASSERT_TRUE(str == "%const.0 + 8");
   }

   str.clear();
@@ -164,7 +164,7 @@ TEST(MachineOperandTest, PrintCPI) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "%const.0 - 12");
+    ASSERT_TRUE(str == "%const.0 - 12");
   }
 }

@@ -183,7 +183,7 @@ TEST(MachineOperandTest, PrintTargetIndexName) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "target-index(<unknown>) + 8");
+    ASSERT_TRUE(str == "target-index(<unknown>) + 8");
   }

   str.clear();
@@ -194,7 +194,7 @@ TEST(MachineOperandTest, PrintTargetIndexName) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "target-index(<unknown>) - 12");
+    ASSERT_TRUE(str == "target-index(<unknown>) - 12");
   }
 }

@@ -211,7 +211,7 @@ TEST(MachineOperandTest, PrintJumpTableIndex) {
   std::string str;
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "%jump-table.3");
+  ASSERT_TRUE(str == "%jump-table.3");
 }

 TEST(MachineOperandTest, PrintExternalSymbol) {
@@ -228,7 +228,7 @@ TEST(MachineOperandTest, PrintExternalSymbol) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "&foo");
+    ASSERT_TRUE(str == "&foo");
   }

   str.clear();
@@ -238,7 +238,7 @@ TEST(MachineOperandTest, PrintExternalSymbol) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "&foo + 12");
+    ASSERT_TRUE(str == "&foo + 12");
   }

   str.clear();
@@ -248,7 +248,7 @@ TEST(MachineOperandTest, PrintExternalSymbol) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "&foo - 12");
+    ASSERT_TRUE(str == "&foo - 12");
   }
 }

@@ -274,7 +274,7 @@ TEST(MachineOperandTest, PrintGlobalAddress) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "@foo + 12");
+    ASSERT_TRUE(str == "@foo + 12");
   }

   str.clear();
@@ -284,7 +284,7 @@ TEST(MachineOperandTest, PrintGlobalAddress) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "@foo - 12");
+    ASSERT_TRUE(str == "@foo - 12");
   }
 }

@@ -302,7 +302,7 @@ TEST(MachineOperandTest, PrintRegisterLiveOut) {
   // Print a MachineOperand containing a register live out list without a TRI.
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "liveout(<unknown>)");
+  ASSERT_TRUE(str == "liveout(<unknown>)");
 }

 TEST(MachineOperandTest, PrintMetadata) {
@@ -328,7 +328,7 @@ TEST(MachineOperandTest, PrintMetadata) {
   MO.print(OS, MST, LLT{}, /*OpIdx*/~0U, /*PrintDef=*/false,
            /*IsStandalone=*/false, /*ShouldPrintRegisterTies=*/false, 0,
            /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "!0");
+  ASSERT_TRUE(str == "!0");
 }

 TEST(MachineOperandTest, PrintMCSymbol) {
@@ -349,7 +349,7 @@ TEST(MachineOperandTest, PrintMCSymbol) {
   // Print a MachineOperand containing a metadata node.
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "<mcsymbol foo>");
+  ASSERT_TRUE(str == "<mcsymbol foo>");
 }

 TEST(MachineOperandTest, PrintCFI) {
@@ -366,7 +366,7 @@ TEST(MachineOperandTest, PrintCFI) {
   // attached to it.
raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "<cfi directive>");
+  ASSERT_TRUE(str == "<cfi directive>");
 }

 TEST(MachineOperandTest, PrintIntrinsicID) {
@@ -383,7 +383,7 @@ TEST(MachineOperandTest, PrintIntrinsicID) {
     // Print a MachineOperand containing a generic intrinsic ID.
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "intrinsic(@llvm.bswap)");
+    ASSERT_TRUE(str == "intrinsic(@llvm.bswap)");
   }

   str.clear();
@@ -394,7 +394,7 @@ TEST(MachineOperandTest, PrintIntrinsicID) {
     // IntrinsicInfo.
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "intrinsic(4294967295)");
+    ASSERT_TRUE(str == "intrinsic(4294967295)");
   }
 }

@@ -411,7 +411,7 @@ TEST(MachineOperandTest, PrintPredicate) {
   // Print a MachineOperand containing a int predicate ICMP_EQ.
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "intpred(eq)");
+  ASSERT_TRUE(str == "intpred(eq)");
 }

 TEST(MachineOperandTest, HashValue) {
diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp
index 3517f0e32b1bb..ed226d5765586 100644
--- a/llvm/unittests/SandboxIR/PassTest.cpp
+++ b/llvm/unittests/SandboxIR/PassTest.cpp
@@ -162,3 +162,34 @@ TEST_F(PassTest, PassRegistry) {
   EXPECT_EQ(Buff, "test-pass1\ntest-pass2\n");
 #endif // NDEBUG
 }
+
+TEST_F(PassTest, ParsePassPipeline) {
+  class TestPass1 final : public FunctionPass {
+  public:
+    TestPass1() : FunctionPass("test-pass1") {}
+    bool runOnFunction(Function &F) final { return false; }
+  };
+  class TestPass2 final : public FunctionPass {
+  public:
+    TestPass2() : FunctionPass("test-pass2") {}
+    bool runOnFunction(Function &F) final { return false; }
+  };
+
+  PassRegistry Registry;
+  Registry.registerPass(std::make_unique<TestPass1>());
+  Registry.registerPass(std::make_unique<TestPass2>());
+
+  auto &FPM =
+      Registry.parseAndCreatePassPipeline("test-pass1,test-pass2,test-pass1");
+#ifndef NDEBUG
+  std::string Buff;
+  llvm::raw_string_ostream SS(Buff);
+  FPM.print(SS);
+  EXPECT_EQ(Buff, "init-fpm(test-pass1,test-pass2,test-pass1)");
+#endif // NDEBUG
+
+  EXPECT_DEATH(Registry.parseAndCreatePassPipeline("bad-pass-name"),
+               ".*not registered.*");
+  EXPECT_DEATH(Registry.parseAndCreatePassPipeline(""), ".*not registered.*");
+  EXPECT_DEATH(Registry.parseAndCreatePassPipeline(","), ".*not registered.*");
+}
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index ad5508f041d6c..d883c185f8296 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -843,6 +843,30 @@ define void @foo(ptr %ptr) {
   EXPECT_EQ(LookupBB2Addr, nullptr);
 }

+TEST_F(SandboxIRTest, DSOLocalEquivalent) {
+  parseIR(C, R"IR(
+declare void @bar()
+define void @foo() {
+  call void dso_local_equivalent @bar()
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+
+  auto &F = *Ctx.createFunction(&LLVMF);
+  auto *BB = &*F.begin();
+  auto It = BB->begin();
+  auto *CI = cast<sandboxir::CallInst>(&*It++);
+  // Check classof().
+  auto *DSOLE = cast<sandboxir::DSOLocalEquivalent>(CI->getCalledOperand());
+  // Check getGlobalValue().
+  auto *GV = DSOLE->getGlobalValue();
+  // Check get().
+  auto *NewDSOLE = sandboxir::DSOLocalEquivalent::get(GV);
+  EXPECT_EQ(NewDSOLE, DSOLE);
+}
+
 TEST_F(SandboxIRTest, ConstantTokenNone) {
   parseIR(C, R"IR(
 define void @foo(ptr %ptr) {
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 7bbf18fe0106f..152715f281088 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -123,52 +123,67 @@ class NVVM_SpecialRegisterOp<string mnemonic, list<Trait> traits = []> :
   let assemblyFormat = "attr-dict `:` type($res)";
 }

+class NVVM_SpecialRangeableRegisterOp<string mnemonic, list<Trait> traits = []> :
+  NVVM_SpecialRegisterOp<mnemonic, traits> {
+  let arguments = (ins OptionalAttr<LLVM_ConstantRangeAttr>:$range);
+  let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)";
+  let llvmBuilder = baseLlvmBuilder # setRangeRetAttrCode # baseLlvmBuilderCoda;
+  let mlirBuilder = baseMlirBuilder # importRangeRetAttrCode # baseMlirBuilderCoda;
+
+  // Backwards-compatibility builder for an unspecified range.
+  let builders = [
+    OpBuilder<(ins "Type":$resultType), [{
+      build($_builder, $_state, resultType, ::mlir::LLVM::ConstantRangeAttr{});
+    }]>
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // Lane index and range
-def NVVM_LaneIdOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.laneid">;
-def NVVM_WarpSizeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.warpsize">;
+def NVVM_LaneIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.laneid">;
+def NVVM_WarpSizeOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpsize">;

 //===----------------------------------------------------------------------===//
 // Thread index and range
-def NVVM_ThreadIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.x">;
-def NVVM_ThreadIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.y">;
-def NVVM_ThreadIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.z">;
-def NVVM_BlockDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.x">;
-def NVVM_BlockDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.y">;
-def NVVM_BlockDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.z">;
+def NVVM_ThreadIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.x">;
+def NVVM_ThreadIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.y">;
+def NVVM_ThreadIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.z">;
+def NVVM_BlockDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.x">;
+def NVVM_BlockDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.y">;
+def NVVM_BlockDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.z">;

 //===----------------------------------------------------------------------===//
 // Block index and range
-def NVVM_BlockIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.x">;
-def NVVM_BlockIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.y">;
-def NVVM_BlockIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.z">;
-def NVVM_GridDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.x">;
-def NVVM_GridDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.y">;
-def NVVM_GridDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.z">;
+def NVVM_BlockIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.x">;
+def NVVM_BlockIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.y">;
+def NVVM_BlockIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.z">;
+def NVVM_GridDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.x">;
+def NVVM_GridDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.y">;
+def NVVM_GridDimZOp : 
NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.z">; //===----------------------------------------------------------------------===// // CTA Cluster index and range -def NVVM_ClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.x">; -def NVVM_ClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.y">; -def NVVM_ClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.z">; -def NVVM_ClusterDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.x">; -def NVVM_ClusterDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.y">; -def NVVM_ClusterDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.z">; +def NVVM_ClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.x">; +def NVVM_ClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.y">; +def NVVM_ClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.z">; +def NVVM_ClusterDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.x">; +def NVVM_ClusterDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.y">; +def NVVM_ClusterDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.z">; //===----------------------------------------------------------------------===// // CTA index and range within Cluster -def NVVM_BlockInClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.x">; -def NVVM_BlockInClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.y">; -def NVVM_BlockInClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.z">; -def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.x">; -def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.y">; -def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.z">; +def NVVM_BlockInClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.x">; +def NVVM_BlockInClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.y">; +def NVVM_BlockInClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.z">; +def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.x">; +def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.y">; +def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.z">; //===----------------------------------------------------------------------===// // CTA index and across Cluster dimensions -def NVVM_ClusterId : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctarank">; -def NVVM_ClusterDim : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctarank">; +def NVVM_ClusterId : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank">; +def NVVM_ClusterDim : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">; //===----------------------------------------------------------------------===// // Clock registers diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 9b1be198f77a8..164622d77e6b6 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -29,6 +29,7 @@ #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Transforms/DialectConversion.h" #include 
"mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -209,7 +210,15 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern { ConversionPatternRewriter &rewriter) const override { auto loc = op->getLoc(); MLIRContext *context = rewriter.getContext(); - Value newOp = rewriter.create(loc, rewriter.getI32Type()); + LLVM::ConstantRangeAttr bounds = nullptr; + if (std::optional upperBound = op.getUpperBound()) + bounds = rewriter.getAttr( + /*bitWidth=*/32, /*lower=*/0, upperBound->getZExtValue()); + else + bounds = rewriter.getAttr( + /*bitWidth=*/32, /*lower=*/0, /*upper=*/kWarpSize); + Value newOp = + rewriter.create(loc, rewriter.getI32Type(), bounds); // Truncate or extend the result depending on the index bitwidth specified // by the LLVMTypeConverter options. const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth(); @@ -340,27 +349,40 @@ void mlir::populateGpuSubgroupReduceOpLoweringPattern( void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns) { + using gpu::index_lowering::IndexKind; + using gpu::index_lowering::IntrType; populateWithGenerated(patterns); patterns.add(converter); patterns.add< gpu::index_lowering::OpLowering, + NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>>( + converter, IndexKind::Block, IntrType::Id); + patterns.add< gpu::index_lowering::OpLowering, + NVVM::BlockDimYOp, NVVM::BlockDimZOp>>( + converter, IndexKind::Block, IntrType::Dim); + patterns.add< gpu::index_lowering::OpLowering, - gpu::index_lowering::OpLowering, - gpu::index_lowering::OpLowering< - gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp, - NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>, - gpu::index_lowering::OpLowering, - gpu::index_lowering::OpLowering, - gpu::index_lowering::OpLowering, - GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(converter); + NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>>( + converter, IndexKind::Other, IntrType::Id); + patterns.add>(converter, IndexKind::Other, IntrType::Dim); + patterns.add>( + converter, IndexKind::Other, IntrType::Id); + patterns.add>(converter, IndexKind::Other, IntrType::Dim); + patterns.add>( + converter, IndexKind::Block, IntrType::Id); + patterns.add>( + converter, IndexKind::Grid, IntrType::Dim); + patterns.add( + converter); patterns.add( converter, NVVM::kSharedMemoryAlignmentBit); diff --git a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp index b65b18699a15a..80edf4a32c6df 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp @@ -390,6 +390,8 @@ Value filterTransform(RewriterBase &rewriter, Location loc, Value filter, TransformMapKeyTy key = {m, r}; int64_t retRows = 1; Value matmulRetValue = extractFilter; + Value zero = builder.create( + loc, rewriter.getZeroAttr(elementType)); if (leftTransform) { // Get constant transform matrix G. auto it = GMatrices.find(key); @@ -399,8 +401,11 @@ Value filterTransform(RewriterBase &rewriter, Location loc, Value filter, retRows = GMatrix.rows; auto matmulType = RankedTensorType::get({retRows, filterW}, elementType); - auto init = builder.create(loc, matmulType.getShape(), - elementType); + auto empty = + builder + .create(loc, matmulType.getShape(), elementType) + .getResult(); + auto init = builder.create(loc, zero, empty).getResult(0); Value G = create2DTransformMatrix(builder, loc, GMatrix, elementType); // Multiply G x g. 
@@ -418,8 +423,11 @@ Value filterTransform(RewriterBase &rewriter, Location loc, Value filter,
     auto matmulType =
         RankedTensorType::get({retRows, GTMatrix.cols}, elementType);
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);

     Value GT = create2DTransformMatrix(builder, loc, GTMatrix, elementType);
     // Multiply u = (G x g) x GT.
@@ -523,6 +531,8 @@ Value inputTransform(RewriterBase &rewriter, Location loc, Value input,
   int64_t retRows = 1;
   int64_t retCols = 1;
   Value matmulRetValue = extractInput;
+  Value zero = builder.create<arith::ConstantOp>(
+      loc, rewriter.getZeroAttr(elementType));
   if (leftTransform) {
     // Get constant transform matrix BT.
     auto it = BTMatrices.find(key);
@@ -532,8 +542,11 @@ Value inputTransform(RewriterBase &rewriter, Location loc, Value input,
     retRows = BTMatrix.rows;
     auto matmulType = RankedTensorType::get({retRows, alphaW}, elementType);
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);
     Value BT =
         create2DTransformMatrix(builder, loc, BTMatrix, builder.getF32Type());
@@ -552,8 +565,11 @@ Value inputTransform(RewriterBase &rewriter, Location loc, Value input,
     retCols = BMatrix.cols;
     auto matmulType = RankedTensorType::get({retRows, retCols}, elementType);
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);
     Value B =
         create2DTransformMatrix(builder, loc, BMatrix, builder.getF32Type());
     // Multiply v = (BT x d) x B.
@@ -636,8 +652,13 @@ static Value matrixMultiply(RewriterBase &rewriter, Location loc,
       {inputShape[0] * inputShape[1],
        inputShape[2] * inputShape[3] * inputShape[4], filterShape[3]},
       outputElementType);
-  Value init = rewriter.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                outputElementType);
+  Value empty = rewriter
+                    .create<tensor::EmptyOp>(loc, matmulType.getShape(),
+                                             outputElementType)
+                    .getResult();
+  Value zero = rewriter.create<arith::ConstantOp>(
+      loc, rewriter.getZeroAttr(outputElementType));
+  Value init = rewriter.create<linalg::FillOp>(loc, zero, empty).getResult(0);

   auto matmulOp = rewriter.create<linalg::BatchMatmulOp>(
       loc, matmulType, ValueRange({collapseInput, collapseFilter}),
@@ -725,6 +746,8 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value,
   int64_t leftScalarFactor = 1;
   int64_t rightScalarFactor = 1;
   Value matmulRetValue = extractValue;
+  Value zero = builder.create<arith::ConstantOp>(
+      loc, rewriter.getZeroAttr(elementType));
   if (leftTransform) {
     // Get constant transform matrix AT.
     auto it = ATMatrices.find(key);
@@ -735,8 +758,11 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value,
     leftScalarFactor = ATMatrix.scalarFactor;
     retRows = ATMatrix.rows;
     auto matmulType = RankedTensorType::get({retRows, valueW}, elementType);
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);

     Value AT = create2DTransformMatrix(builder, loc, ATMatrix, elementType);
     // Multiply AT x m.
@@ -756,8 +782,11 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value,
     auto matmulType =
         RankedTensorType::get({retRows, AMatrix.cols}, elementType);
     retCols = AMatrix.cols;
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);
     Value A = create2DTransformMatrix(builder, loc, AMatrix, elementType);
     // Multiply y = (AT x m) x A.
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
index 855abc12a909e..bc830a77f3c58 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Target/LLVMIR/ModuleImport.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"

 using namespace mlir;
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 8f2ec289c9252..66ad1e307fc3a 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -50,7 +50,7 @@ gpu.module @test_module_0 {
   %gDimZ = gpu.grid_dim z

-  // CHECK: = nvvm.read.ptx.sreg.laneid : i32
+  // CHECK: = nvvm.read.ptx.sreg.laneid range <i32 0, 32> : i32
   // CHECK: = llvm.sext %{{.*}} : i32 to i64
   %laneId = gpu.lane_id

@@ -699,9 +699,21 @@ gpu.module @test_module_32 {
 }

 gpu.module @test_module_33 {
-// CHECK-LABEL: func @kernel_with_block_size()
-// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 32, 4, 2>, nvvm.kernel, nvvm.maxntid = array<i32: 32, 4, 2>}
-  gpu.func @kernel_with_block_size() kernel attributes {known_block_size = array<i32: 32, 4, 2>} {
+// CHECK-LABEL: func @kernel_with_block_size(
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 32, 4, 2>, nvvm.kernel, nvvm.maxntid = array<i32: 32, 4, 2>}
+  gpu.func @kernel_with_block_size(%arg0: !llvm.ptr) kernel attributes {known_block_size = array<i32: 32, 4, 2>} {
+  // CHECK: = nvvm.read.ptx.sreg.tid.x range <i32 0, 32> : i32
+  %0 = gpu.thread_id x
+  // CHECK: = nvvm.read.ptx.sreg.tid.y range <i32 0, 4> : i32
+  %1 = gpu.thread_id y
+  // CHECK: = nvvm.read.ptx.sreg.tid.z range <i32 0, 2> : i32
+  %2 = gpu.thread_id z
+
+  // Fake usage to prevent dead code elimination
+  %3 = arith.addi %0, %1 : index
+  %4 = arith.addi %3, %2 : index
+  %5 = arith.index_cast %4 : index to i64
+  llvm.store %5, %arg0 : i64, !llvm.ptr
   gpu.return
   }
 }
@@ -917,6 +929,20 @@ gpu.module @test_module_48 {
   }
 }

+gpu.module @test_module_49 {
+// CHECK-LABEL: func @explicit_id_bounds()
+  func.func @explicit_id_bounds() -> (index, index, index) {
+    // CHECK: = nvvm.read.ptx.sreg.tid.x range <i32 0, 32> : i32
+    %0 = gpu.thread_id x upper_bound 32
+    // CHECK: = nvvm.read.ptx.sreg.ntid.x range <i32 1, 33> : i32
+    %1 = gpu.block_dim x upper_bound 32
+    // CHECK: = nvvm.read.ptx.sreg.laneid range <i32 0, 16> : i32
+    %2 = gpu.lane_id upper_bound 16
+
+    return %0, %1, %2 : index, index, index
+  }
+}
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
     %gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module
diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir
index 6bb3fb1423edc..c5760acf94a88 100644
--- a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir
+++ 
b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir @@ -36,6 +36,13 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @conv2d // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { // CHECK: %[[CST:.*]] = arith.constant 1.024000e+03 : f32 +// CHECK: %[[CST_0:.*]] = arith.constant dense<{{.*}}> : tensor<6x4xf32> +// CHECK: %[[CST_1:.*]] = arith.constant dense<{{.*}}> : tensor<4x6xf32> +// CHECK: %[[CST_2:.*]] = arith.constant dense<{{.*}}> : tensor<6x6xf32> +// CHECK: %[[CST_3:.*]] = arith.constant dense<{{.*}}> : tensor<6x6xf32> +// CHECK: %[[CST_4:.*]] = arith.constant dense<{{.*}}> : tensor<3x6xf32> +// CHECK: %[[CST_5:.*]] = arith.constant dense<{{.*}}> : tensor<6x3xf32> +// CHECK: %[[CST_6:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C5:.*]] = arith.constant 5 : index // CHECK: %[[C2:.*]] = arith.constant 2 : index @@ -44,9 +51,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 3, 3, 1] [1, 1, 1, 1] -// CHECK: %[[S11:.*]] = linalg.matmul -// CHECK: %[[S13:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 6, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S10:.*]] = tensor.empty() : tensor<6x3xf32> +// CHECK: %[[S11:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S10]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK: %[[S12:.*]] = linalg.matmul ins(%[[CST_5]], %[[EXTRACTED_SLICE]] : tensor<6x3xf32>, tensor<3x3xf32>) outs(%[[S11]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK: %[[S13:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S14:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S13]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S15:.*]] = linalg.matmul ins(%[[S12]], %[[CST_4]] : tensor<6x3xf32>, tensor<3x6xf32>) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 6, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S9]] // CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> @@ -60,9 +71,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) // CHECK: %[[S13:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) // CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 6, 6, 1] [1, 1, 1, 1] -// CHECK: %[[S15:.*]] = linalg.matmul -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S17]] into %[[ARG10]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S14:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S15:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S16:.*]] = linalg.matmul ins(%[[CST_3]], %[[EXTRACTED_SLICE_8]] : tensor<6x6xf32>, tensor<6x6xf32>) 
outs(%[[S15]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S17:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S18:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S17]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S19:.*]] = linalg.matmul ins(%[[S16]], %[[CST_2]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S18]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S19]] into %[[ARG10]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_9]] // CHECK: scf.yield %[[S13]] // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] @@ -82,15 +97,19 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) // CHECK: %[[S15:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) // CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[S19:.*]] = linalg.matmul -// CHECK: %[[S20:.*]] = tensor.empty() -// CHECK: %[[S21:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S20]] : tensor<4x4xf32>) { +// CHECK: %[[S16:.*]] = tensor.empty() : tensor<4x6xf32> +// CHECK: %[[S17:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S16]] : tensor<4x6xf32>) -> tensor<4x6xf32> +// CHECK: %[[S18:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_8]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S17]] : tensor<4x6xf32>) -> tensor<4x6xf32> +// CHECK: %[[S19:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK: %[[S20:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S19]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[S21:.*]] = linalg.matmul ins(%[[S18]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[S22:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S22]] : tensor<4x4xf32>) { // CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): // CHECK: linalg.yield %[[IN]] : f32 // CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S22:.*]] = linalg.mul ins(%[[S21]], %[[S19]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S22]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: %[[S24:.*]] = linalg.mul ins(%[[S23]], %[[S21]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S22]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S24]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_9]] // CHECK: scf.yield %[[S15]] // CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) @@ -114,14 +133,15 @@ func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5x %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> %4 
= tensor.empty() : tensor<36x18x2xf32> - %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%4 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> - %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> + %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> + %6 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%5 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> + %expanded = tensor.expand_shape %6 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> %padded_1 = tensor.pad %arg2 low[0, 0, 0, 0] high[0, 3, 3, 0] { ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index): tensor.yield %cst : f32 } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> - %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> - %extracted_slice = tensor.extract_slice %6[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> + %7 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> + %extracted_slice = tensor.extract_slice %7[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> return %extracted_slice : tensor<2x9x9x2xf32> } @@ -147,18 +167,29 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @conv2d_unaligned // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x11x11x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { // CHECK: %[[CST:.*]] = arith.constant 1.024000e+03 : f32 +// CHECK: %[[CST_0:.*]] = arith.constant dense<{{.*}}> : tensor<6x4xf32> +// CHECK: %[[CST_1:.*]] = arith.constant dense<{{.*}}> : tensor<4x6xf32> +// CHECK: %[[CST_2:.*]] = arith.constant dense<{{.*}}> : tensor<6x6xf32> +// CHECK: %[[CST_3:.*]] = arith.constant dense<{{.*}}> : tensor<6x6xf32> // CHECK: %[[C3:.*]] = arith.constant 3 : index +// CHECK: %[[CST_4:.*]] = arith.constant dense<{{.*}}> : tensor<3x6xf32> +// CHECK: %[[CST_5:.*]] = arith.constant dense<{{.*}}> : tensor<6x3xf32> // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C5:.*]] = arith.constant 5 : index // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[CST_6:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[S0:.*]] = tensor.empty() // CHECK: %[[S1:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S0]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) // CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 3, 3, 1] [1, 1, 1, 1] -// CHECK: %[[S11:.*]] = linalg.matmul -// CHECK: %[[S13:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG7]][0, 0, %[[ARG6]], %[[ARG4]]] [6, 6, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S11:.*]] = tensor.empty() : tensor<6x3xf32> +// CHECK: %[[S12:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S11]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK: %[[S13:.*]] = linalg.matmul ins(%[[CST_5]], %[[EXTRACTED_SLICE_9]] : tensor<6x3xf32>, tensor<3x3xf32>) outs(%[[S12]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// 
CHECK: %[[S14:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S15:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S16:.*]] = linalg.matmul ins(%[[S13]], %[[CST_4]] : tensor<6x3xf32>, tensor<3x6xf32>) outs(%[[S15]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S16]] into %[[ARG7]][0, 0, %[[ARG6]], %[[ARG4]]] [6, 6, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x5x2xf32> // CHECK: scf.yield %[[S9]] : tensor<6x6x5x2xf32> // CHECK: %[[PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0, 0, 0] high[0, 3, 3, 0] @@ -173,9 +204,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) // CHECK: %[[S13:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) // CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 6, 6, 1] [1, 1, 1, 1] -// CHECK: %[[S15:.*]] = linalg.matmul -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S17]] into %[[ARG11]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S15:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S16:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S15]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S17:.*]] = linalg.matmul ins(%[[CST_3]], %[[EXTRACTED_SLICE_11]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S16]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S18:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S19:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S18]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S20:.*]] = linalg.matmul ins(%[[S17]], %[[CST_2]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S19]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S20]] into %[[ARG11]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_12]] : tensor<6x6x1x1x2x5xf32> // CHECK: scf.yield %[[S13]] : tensor<6x6x1x1x2x5xf32> // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] @@ -196,15 +231,19 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) // CHECK: %[[S15:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) // CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[S19:.*]] = linalg.matmul +// CHECK: %[[S17:.*]] = tensor.empty() : tensor<4x6xf32> +// CHECK: %[[S18:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S17]] : tensor<4x6xf32>) -> tensor<4x6xf32> +// CHECK: %[[S19:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_11]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S18]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S20:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S21:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S20]] : tensor<4x4xf32>) { 
+// CHECK: %[[S21:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[S22:.*]] = linalg.matmul ins(%[[S19]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S21]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[S23:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK: %[[S24:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S23]] : tensor<4x4xf32>) { // CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): // CHECK: linalg.yield %[[IN]] : f32 // CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S22:.*]] = linalg.mul ins(%[[S21]], %[[S19]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S22]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: %[[S25:.*]] = linalg.mul ins(%[[S24]], %[[S22]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S23]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S25]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_12]] // CHECK: scf.yield %[[S15]] : tensor<2x4x4x2xf32> // CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) @@ -218,6 +257,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @conv2d_mx1_rx1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32>, %arg2: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { + %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<6x1x5x2xf32> %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x1x5xf32>) outs(%0 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> %2 = tensor.empty() : tensor<6x1x1x1x2x5xf32> @@ -225,10 +265,11 @@ func.func @conv2d_mx1_rx1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32> %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x1x5x2xf32> into tensor<6x5x2xf32> %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x1x1x1x2x5xf32> into tensor<6x2x5xf32> %4 = tensor.empty() : tensor<6x2x2xf32> - %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%4 : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> - %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> - %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x1x1x1x2x2xf32>) outs(%arg2 : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> - return %6 : tensor<2x4x1x2xf32> + %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> + %6 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%5 : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> + %expanded = tensor.expand_shape %6 [[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> + %7 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x1x1x1x2x2xf32>) outs(%arg2 : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> + return %7 : tensor<2x4x1x2xf32> } module attributes {transform.with_named_sequence} { @@ -252,41 +293,53 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @conv2d_mx1_rx1 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x1x5xf32>, %[[ARG1:.*]]: tensor<2x3x1x5xf32>, %[[ARG2:.*]]: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { // CHECK: %[[CST:.*]] = 
arith.constant 3.200000e+01 : f32 +// CHECK: %[[CST_0:.*]] = arith.constant dense<{{.*}}> : tensor<4x6xf32> +// CHECK: %[[CST_1:.*]] = arith.constant dense<{{.*}}> : tensor<6x6xf32> +// CHECK: %[[CST_2:.*]] = arith.constant dense<{{.*}}> : tensor<6x3xf32> // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C5:.*]] = arith.constant 5 : index // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[CST_3:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> // CHECK: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) // CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 3, 1, 1] [1, 1, 1, 1] -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 1, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S8:.*]] = tensor.empty() : tensor<6x1xf32> +// CHECK: %[[S9:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S8]] : tensor<6x1xf32>) -> tensor<6x1xf32> +// CHECK: %[[S10:.*]] = linalg.matmul ins(%[[CST_2]], %[[EXTRACTED_SLICE]] : tensor<6x3xf32>, tensor<3x1xf32>) outs(%[[S9]] : tensor<6x1xf32>) -> tensor<6x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S10]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 1, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S7]] // CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> // CHECK: %[[S3:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S2]]) // CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 6, 1, 1] [1, 1, 1, 1] -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG6]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S8:.*]] = tensor.empty() : tensor<6x1xf32> +// CHECK: %[[S9:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S8]] : tensor<6x1xf32>) -> tensor<6x1xf32> +// CHECK: %[[S10:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE]] : tensor<6x6xf32>, tensor<6x1xf32>) outs(%[[S9]] : tensor<6x1xf32>) -> tensor<6x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S10]] into %[[ARG6]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S7]] // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] // CHECK: %[[COLLAPSED_3:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]] -// CHECK: %[[S5:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] +// CHECK: %[[S4:.*]] = tensor.empty() : tensor<6x2x2xf32> +// CHECK: %[[S5:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S4]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK: %[[S6:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_3]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S5]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] 
output_shape [6, 1, 1, 1, 2, 2] // CHECK: %[[S6:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]]) // CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[S10:.*]] = tensor.empty() : tensor<4x1xf32> -// CHECK: %[[S11:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S10]] : tensor<4x1xf32>) { +// CHECK: %[[S9:.*]] = tensor.empty() : tensor<4x1xf32> +// CHECK: %[[S10:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S9]] : tensor<4x1xf32>) -> tensor<4x1xf32> +// CHECK: %[[S11:.*]] = linalg.matmul ins(%[[CST_0]], %[[EXTRACTED_SLICE]] : tensor<4x6xf32>, tensor<6x1xf32>) outs(%[[S10]] : tensor<4x1xf32>) -> tensor<4x1xf32> +// CHECK: %[[S12:.*]] = tensor.empty() : tensor<4x1xf32> +// CHECK: %[[S13:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S12]] : tensor<4x1xf32>) { // CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): // CHECK: linalg.yield %[[IN]] : f32 // CHECK: } -> tensor<4x1xf32> -// CHECK: %[[S12:.*]] = linalg.mul ins(%[[S11]], %[[S9]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S10]] : tensor<4x1xf32>) -> tensor<4x1xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S14:.*]] = linalg.mul ins(%[[S13]], %[[S11]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S12]] : tensor<4x1xf32>) -> tensor<4x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S7]] // CHECK: return %[[S6]] diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir index 095a6636b68dc..4369f5f1eab4c 100644 --- a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir @@ -13,14 +13,15 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg %collapsed = tensor.collapse_shape %3 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> %collapsed_0 = tensor.collapse_shape %5 [[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> %6 = tensor.empty() : tensor<36x18x2xf32> - %7 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%6 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> - %expanded = tensor.expand_shape %7 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> + %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> + %8 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%7 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> + %expanded = tensor.expand_shape %8 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> %padded_1 = tensor.pad %arg2 low[0, 0, 0, 0] high[0, 3, 3, 0] { ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): tensor.yield %cst : f32 } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> - %8 = 
linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> - %extracted_slice = tensor.extract_slice %8[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> + %9 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> + %extracted_slice = tensor.extract_slice %9[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> return %extracted_slice : tensor<2x9x9x2xf32> } @@ -46,11 +47,13 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) { // CHECK-NEXT: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x6x5x2xf32>) { // CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], %[[C0]], %[[C0]], %[[ARG5]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<3x3xf32> -// CHECK-NEXT: %[[S8:.*]] = tensor.empty() : tensor<6x3xf32> -// CHECK-NEXT: %[[S9:.*]] = linalg.matmul ins(%[[CST_5]], %[[EXTRACTED_SLICE_9]] : tensor<6x3xf32>, tensor<3x3xf32>) outs(%[[S8]] : tensor<6x3xf32>) -> tensor<6x3xf32> -// CHECK-NEXT: %[[S10:.*]] = tensor.empty() : tensor<6x6xf32> -// CHECK-NEXT: %[[S11:.*]] = linalg.matmul ins(%[[S9]], %[[CST_4]] : tensor<6x3xf32>, tensor<3x6xf32>) outs(%[[S10]] : tensor<6x6xf32>) -> tensor<6x6xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG6]][%[[C0]], %[[C0]], %[[ARG5]], %[[ARG3]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x5x2xf32> +// CHECK-NEXT: %[[S9:.*]] = tensor.empty() : tensor<6x3xf32> +// CHECK-NEXT: %[[S10:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S9]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK-NEXT: %[[S11:.*]] = linalg.matmul ins(%[[CST_5]], %[[EXTRACTED_SLICE_9]] : tensor<6x3xf32>, tensor<3x3xf32>) outs(%[[S10]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK-NEXT: %[[S12:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK-NEXT: %[[S13:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S12]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[S14:.*]] = linalg.matmul ins(%[[S11]], %[[CST_4]] : tensor<6x3xf32>, tensor<3x6xf32>) outs(%[[S13]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG6]][%[[C0]], %[[C0]], %[[ARG5]], %[[ARG3]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x5x2xf32> // CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x5x2xf32> // CHECK-NEXT: } // CHECK-NEXT: scf.yield %[[S7]] : tensor<6x6x5x2xf32> @@ -67,11 +70,13 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) // CHECK-NEXT: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) // CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[PADDED]][%[[ARG7]], %[[S10]], %[[S11]], %[[ARG9]]] [1, 6, 6, 1] [1, 1, 1, 1] : tensor<2x14x14x5xf32> to tensor<6x6xf32> -// CHECK-NEXT: %[[S12:.*]] = tensor.empty() : tensor<6x6xf32> -// CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[CST_3]], %[[EXTRACTED_SLICE_9]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S12]] : tensor<6x6xf32>) -> tensor<6x6xf32> -// CHECK-NEXT: %[[S14:.*]] = tensor.empty() : 
tensor<6x6xf32> -// CHECK-NEXT: %[[S15:.*]] = linalg.matmul ins(%[[S13]], %[[CST_2]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG10]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x3x3x2x5xf32> +// CHECK-NEXT: %[[S13:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK-NEXT: %[[S14:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S13]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[S15:.*]] = linalg.matmul ins(%[[CST_3]], %[[EXTRACTED_SLICE_9]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[S16:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK-NEXT: %[[S17:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S16]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[S18:.*]] = linalg.matmul ins(%[[S15]], %[[CST_2]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S17]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S18]] into %[[ARG10]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x3x3x2x5xf32> // CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x3x3x2x5xf32> // CHECK-NEXT: } // CHECK-NEXT: scf.yield %[[S9]] : tensor<6x6x3x3x2x5xf32> @@ -83,8 +88,9 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<36x18x2xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_7]], %[[COLLAPSED]] : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%[[S4]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S4]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> +// CHECK-NEXT: %[[S6:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_7]], %[[COLLAPSED]] : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%[[S5]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> // CHECK-NEXT: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] { // CHECK-NEXT: ^bb0(%[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index, %[[ARG6:.*]]: index): // CHECK-NEXT: tensor.yield %[[CST_6]] : f32 @@ -94,19 +100,21 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[S8:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x12x12x2xf32>) { // CHECK-NEXT: %[[S9:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x12x12x2xf32>) { // CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : 
tensor<6x6x3x3x2x2xf32> to tensor<6x6xf32> -// CHECK-NEXT: %[[S10:.*]] = tensor.empty() : tensor<4x6xf32> -// CHECK-NEXT: %[[S11:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_9]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S10]] : tensor<4x6xf32>) -> tensor<4x6xf32> -// CHECK-NEXT: %[[S12:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[S11]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S12]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK-NEXT: %[[S11:.*]] = tensor.empty() : tensor<4x6xf32> +// CHECK-NEXT: %[[S12:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S11]] : tensor<4x6xf32>) -> tensor<4x6xf32> +// CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_9]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S12]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK-NEXT: %[[S14:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK-NEXT: %[[S15:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S14]] : tensor<4x4xf32>) { +// CHECK-NEXT: %[[S15:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S14]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK-NEXT: %[[S16:.*]] = linalg.matmul ins(%[[S13]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S15]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK-NEXT: %[[S17:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK-NEXT: %[[S18:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S17]] : tensor<4x4xf32>) { // CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): // CHECK-NEXT: linalg.yield %[[IN]] : f32 // CHECK-NEXT: } -> tensor<4x4xf32> -// CHECK-NEXT: %[[S16:.*]] = linalg.mul ins(%[[S15]], %[[S13]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S14]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK-NEXT: %[[S17:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK-NEXT: %[[S18:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S16]] into %[[ARG10]][%[[ARG7]], %[[S17]], %[[S18]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> +// CHECK-NEXT: %[[S19:.*]] = linalg.mul ins(%[[S18]], %[[S16]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S17]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK-NEXT: %[[S20:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK-NEXT: %[[S21:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S19]] into %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> // CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<2x12x12x2xf32> // CHECK-NEXT: } // CHECK-NEXT: scf.yield %[[S9]] : tensor<2x12x12x2xf32> diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d.mlir index ec11a6ef8fbee..0040d81a2d24e 100644 --- a/mlir/test/Dialect/Linalg/winograd-conv2d.mlir +++ b/mlir/test/Dialect/Linalg/winograd-conv2d.mlir @@ -7,6 +7,7 @@ func.func @conv2d_4x4_3x3(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x3x3x5xf32> // CHECK-LABEL: func.func @conv2d_4x4_3x3 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x6x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> { +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x5x2xf32> // 
CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x3x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x6x1x1x2x5xf32> @@ -14,10 +15,11 @@ func.func @conv2d_4x4_3x3(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x3x3x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf32> into tensor<36x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<36x2x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%[[S6]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S7]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S8:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> -// CHECK-NEXT: return %[[S8]] : tensor<2x4x4x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> +// CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%[[S7]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> +// CHECK-NEXT: return %[[S9]] : tensor<2x4x4x2xf32> // CHECK-NEXT: } // ----- @@ -29,6 +31,7 @@ func.func @conv2d_2x2_5x5(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x5x5x5xf32> // CHECK-LABEL: func.func @conv2d_2x2_5x5 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x6x5xf32>, %[[ARG1:.*]]: tensor<2x5x5x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> { +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(2) r(5) ins(%[[ARG1]] : tensor<2x5x5x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x6x1x1x2x5xf32> @@ -36,10 +39,11 @@ func.func @conv2d_2x2_5x5(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x5x5x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf32> into tensor<36x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<36x2x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%[[S6]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S7]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S8:.*]] = 
linalg.winograd_output_transform m(2) r(5) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> -// CHECK-NEXT: return %[[S8]] : tensor<2x2x2x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> +// CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%[[S7]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(2) r(5) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> +// CHECK-NEXT: return %[[S9]] : tensor<2x2x2x2xf32> // CHECK-NEXT: } // ----- @@ -51,6 +55,7 @@ func.func @conv2d_1x4_1x3(%arg0: tensor<2x1x6x5xf32>, %arg1: tensor<2x1x3x5xf32> // CHECK-LABEL: func.func @conv2d_1x4_1x3 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x1x6x5xf32>, %[[ARG1:.*]]: tensor<2x1x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x1x4x2xf32>) -> tensor<2x1x4x2xf32> { +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<1x6x5x2xf32> // CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x1x3x5xf32>) outs(%[[S2]] : tensor<1x6x5x2xf32>) -> tensor<1x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<1x6x1x1x2x5xf32> @@ -58,10 +63,11 @@ func.func @conv2d_1x4_1x3(%arg0: tensor<2x1x6x5xf32>, %arg1: tensor<2x1x3x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<1x6x5x2xf32> into tensor<6x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<1x6x1x1x2x5xf32> into tensor<6x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<6x2x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S6]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S7]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [1, 6, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<1x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S8:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<1x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x1x4x2xf32>) -> tensor<2x1x4x2xf32> -// CHECK-NEXT: return %[[S8]] : tensor<2x1x4x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S7]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [1, 6, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<1x6x1x1x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<1x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x1x4x2xf32>) -> tensor<2x1x4x2xf32> +// CHECK-NEXT: return %[[S9]] : tensor<2x1x4x2xf32> // CHECK-NEXT: } // ----- @@ -73,6 +79,7 @@ func.func @conv2d_4x1_3x1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32> // CHECK-LABEL: func.func @conv2d_4x1_3x1 // CHECK-SAME: (%[[ARG0:.*]]: 
tensor<2x6x1x5xf32>, %[[ARG1:.*]]: tensor<2x3x1x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x1x5x2xf32> // CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x1x5xf32>) outs(%[[S2]] : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> @@ -80,10 +87,11 @@ func.func @conv2d_4x1_3x1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x1x5x2xf32> into tensor<6x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x1x1x1x2x5xf32> into tensor<6x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<6x2x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S6]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S7]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> -// CHECK-NEXT: %[[S8:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x1x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> -// CHECK-NEXT: return %[[S8]] : tensor<2x4x1x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S7]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x1x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> +// CHECK-NEXT: return %[[S9]] : tensor<2x4x1x2xf32> // CHECK-NEXT: } // ----- @@ -95,6 +103,7 @@ func.func @conv2d_aligned(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<2x3x3x5xf3 // CHECK-LABEL: func.func @conv2d_aligned // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x3x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> @@ -102,10 +111,11 @@ func.func @conv2d_aligned(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<2x3x3x5xf3 // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x2x2x2x5xf32> into tensor<36x8x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<36x8x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x8x5xf32>, tensor<36x5x2xf32>) outs(%[[S6]] : 
tensor<36x8x2xf32>) -> tensor<36x8x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S7]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32> -// CHECK-NEXT: %[[S8:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x2x2x2x2xf32>) outs(%[[ARG3]] : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> -// CHECK-NEXT: return %[[S8]] : tensor<2x8x8x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<36x8x2xf32>) -> tensor<36x8x2xf32> +// CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x8x5xf32>, tensor<36x5x2xf32>) outs(%[[S7]] : tensor<36x8x2xf32>) -> tensor<36x8x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x2x2x2x2xf32>) outs(%[[ARG3]] : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> +// CHECK-NEXT: return %[[S9]] : tensor<2x8x8x2xf32> // CHECK-NEXT: } // ----- @@ -129,14 +139,15 @@ func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5x // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %3 {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<36x18x2xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%[[S4]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S4]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> +// CHECK-NEXT: %[[S6:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%[[S5]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> // CHECK-NEXT: %[[PADDED_1:.*]] = tensor.pad %arg3 low[0, 0, 0, 0] high[0, 3, 3, 0] { // CHECK-NEXT: ^bb0 // CHECK-NEXT: tensor.yield %[[CST]] : f32 // CHECK-NEXT: } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> -// CHECK-NEXT: %[[S6:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x3x3x2x2xf32>) outs(%[[PADDED_1]] : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S6]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x3x3x2x2xf32>) outs(%[[PADDED_1]] : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S7]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> // CHECK-NEXT: return %[[EXTRACTED_SLICE]] : tensor<2x9x9x2xf32> // CHECK-NEXT: } @@ -149,17 +160,19 @@ func.func @conv2d_type_promotion(%arg0: tensor<2x6x6x5xf16>, %arg1: 
tensor<2x3x3
// CHECK-LABEL: func.func @conv2d_type_promotion
// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x6x5xf16>, %[[ARG1:.*]]: tensor<2x3x3x5xf16>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> {
-// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf16>
+// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-NEXT: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf16>
// CHECK-NEXT: %[[S1:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x3x5xf16>) outs(%[[S0]] : tensor<6x6x5x2xf16>) -> tensor<6x6x5x2xf16>
// CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x1x1x2x5xf16>
// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[ARG0]] : tensor<2x6x6x5xf16>) outs(%[[S2]] : tensor<6x6x1x1x2x5xf16>) -> tensor<6x6x1x1x2x5xf16>
// CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf16> into tensor<36x5x2xf16>
// CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf16> into tensor<36x2x5xf16>
// CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<36x2x2xf32>
-// CHECK-NEXT: %[[S5:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf16>, tensor<36x5x2xf16>) outs(%[[S4]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32>
-// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32>
-// CHECK-NEXT: %[[S6:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32>
-// CHECK-NEXT: return %[[S6]] : tensor<2x4x4x2xf32>
+// CHECK-NEXT: %[[S5:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S4]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32>
+// CHECK-NEXT: %[[S6:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf16>, tensor<36x5x2xf16>) outs(%[[S5]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32>
+// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32>
+// CHECK-NEXT: %[[S7:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32>
+// CHECK-NEXT: return %[[S7]] : tensor<2x4x4x2xf32>
// CHECK-NEXT: }

// -----
diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
index e4a8773e2dd80..131e9065b2d88 100644
--- a/mlir/test/Target/LLVMIR/Import/nvvmir.ll
+++ b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
@@ -58,6 +58,9 @@ define i32 @nvvm_special_regs() {
  %27 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank()
  ; CHECK: = nvvm.read.ptx.sreg.cluster.nctarank : i32
  %28 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank()
+
+  ; CHECK: = nvvm.read.ptx.sreg.tid.x range <0 : i32, 64 : i32> : i32
+  %29 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %1
}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 88ffb1c7bfdf7..7fd082a5eb3c7 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -62,7 +62,10 @@ llvm.func @nvvm_special_regs() -> i32 {
  %29 = nvvm.read.ptx.sreg.clock : i32
  // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64
  %30 = nvvm.read.ptx.sreg.clock64 : i64
-
+
+  // CHECK: %31 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %31 = nvvm.read.ptx.sreg.tid.x range <0 : i32, 64 : i32> : i32
+
  llvm.return %1 : i32
}
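Note on the Linalg test updates above: `linalg.matmul` and `linalg.batch_matmul` accumulate into their `outs` operand (C += A * B), so a destination taken straight from `tensor.empty()` holds undefined values. The Winograd decomposition therefore now seeds every contraction with a `linalg.fill` of 0.0, which is what all the renumbered CHECK lines reflect. A minimal sketch of the pattern (a standalone illustration, not a snippet from the patch; the function and value names are made up):

```mlir
// Zero-initialize the accumulator of a batch contraction before using it.
func.func @matmul_init(%a: tensor<36x2x5xf32>, %b: tensor<36x5x2xf32>) -> tensor<36x2x2xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  // tensor.empty() yields an undefined buffer; it must not be read as-is.
  %empty = tensor.empty() : tensor<36x2x2xf32>
  // linalg.fill seeds the accumulator so the result is A*B, not garbage + A*B.
  %init = linalg.fill ins(%cst : f32) outs(%empty : tensor<36x2x2xf32>) -> tensor<36x2x2xf32>
  %res = linalg.batch_matmul ins(%a, %b : tensor<36x2x5xf32>, tensor<36x5x2xf32>)
                             outs(%init : tensor<36x2x2xf32>) -> tensor<36x2x2xf32>
  return %res : tensor<36x2x2xf32>
}
```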
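The NVVM test changes exercise the `range` attribute on special-register reads: on export it becomes an LLVM `range(i32 0, 64)` call-site attribute, and the importer maps it back onto the MLIR op. A hedged round-trip sketch, with the attribute syntax taken from the import-side CHECK line in this patch (the function name is hypothetical):

```mlir
llvm.func @tid_with_range() -> i32 {
  // Exports as: %0 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %0 = nvvm.read.ptx.sreg.tid.x range <0 : i32, 64 : i32> : i32
  llvm.return %0 : i32
}
```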